http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java new file mode 100644 index 0000000..b1762e6 --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java @@ -0,0 +1,67 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Test extraction of image metadata
+ */
+public class TestImageMetadata {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Sample images; they must be present in the "test.data" directory
+  // (the current directory when the property is unset) before the test runs.
+  private String[] sampleFiles = { "nutch_logo_tm.gif", };
+
+  /**
+   * Fetches every sample image through its "file:" URL, parses it with the
+   * "parse-tika" extension and checks the width and height reported in the
+   * parse metadata.
+   * 
+   * @throws ProtocolException
+   *           declared by the protocol lookup
+   * @throws ParseException
+   *           declared by the parse call
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    // Neither the configuration nor the protocol factory depends on the
+    // sample file, so both are created once instead of once per iteration.
+    Configuration conf = NutchConfiguration.create();
+    ProtocolFactory factory = new ProtocolFactory(conf);
+
+    for (String sampleFile : sampleFiles) {
+      String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
+
+      Protocol protocol = factory.getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika",
+          content).get(content.getUrl());
+
+      // Name the sample in the message so a failure points at the file.
+      Assert.assertEquals("width of " + sampleFile, "121", parse.getData()
+          .getMeta("width"));
+      Assert.assertEquals("height of " + sampleFile, "48", parse.getData()
+          .getMeta("height"));
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java new file mode 100644 index 0000000..576b3df --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java @@ -0,0 +1,92 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+
+/**
+ * Unit tests for MSWordParser.
+ * 
+ * @author John Xing
+ */
+public class TestMSWordParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-msword/build.xml during plugin compilation.
+  // Check ./src/plugin/parse-msword/sample/README.txt for what they are.
+  private String[] sampleFiles = { "word97.doc" };
+
+  private String expectedText = "This is a sample doc file prepared for nutch.";
+
+  private Configuration conf;
+
+  /**
+   * Creates a fresh configuration for each test and sets
+   * "file.content.limit" to "-1".
+   */
+  @Before
+  public void setUp() {
+    conf = NutchConfiguration.create();
+    conf.set("file.content.limit", "-1");
+  }
+
+  /**
+   * Fetches a file from the sample directory through its "file:" URL and
+   * parses it with the "parse-tika" extension.
+   * 
+   * @param fileName
+   *          name of the file, relative to the sample directory
+   * @return the text of the resulting parse
+   * @throws ProtocolException
+   *           declared by the protocol lookup
+   * @throws ParseException
+   *           declared by the parse call
+   */
+  public String getTextContent(String fileName) throws ProtocolException,
+      ParseException {
+    String urlString = "file:" + sampleDir + fileSeparator + fileName;
+    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    Content content = protocol.getProtocolOutput(new Text(urlString),
+        new CrawlDatum()).getContent();
+    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+        .get(content.getUrl());
+    return parse.getText();
+  }
+
+  /**
+   * Checks that the text of every listed sample starts with the expected
+   * sentence.
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    for (String sampleFile : sampleFiles) {
+      String found = getTextContent(sampleFile);
+      Assert.assertTrue("text found : '" + found + "'",
+          found.startsWith(expectedText));
+    }
+  }
+
+  /**
+   * Checks that every ".doc" file in the sample directory yields non-empty
+   * text.
+   */
+  @Test
+  public void testOpeningDocs() throws ProtocolException, ParseException {
+    String[] filenames = new File(sampleDir).list();
+    // File.list() returns null when the path is not a directory or cannot be
+    // read; report that explicitly rather than as a NullPointerException.
+    Assert.assertNotNull("sample directory cannot be listed: " + sampleDir,
+        filenames);
+    for (String filename : filenames) {
+      if (!filename.endsWith(".doc")) {
+        continue;
+      }
+      Assert.assertTrue("cann't read content of " + filename,
+          getTextContent(filename).length() > 0);
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
new file mode 100644
index 0000000..6960bad
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor
license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.protocol.*;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for OOParser.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class TestOOParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-oo/build.xml during plugin compilation.
+  private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };
+
+  // Whitespace-normalized content of the reference text file; stays null
+  // when that file could not be read. It is only printed, never asserted on.
+  private String expectedText;
+
+  private String sampleText = "ootest.txt";
+
+  /**
+   * Parses every "ootest*" sample with the "parse-tika" extension and checks
+   * that some text was extracted.
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    Configuration conf = NutchConfiguration.create();
+    ProtocolFactory factory = new ProtocolFactory(conf);
+
+    System.out.println("Expected : " + expectedText);
+
+    for (String sampleFile : sampleFiles) {
+      if (!sampleFile.startsWith("ootest")) {
+        continue;
+      }
+
+      String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
+      Protocol protocol = factory.getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika",
+          content).get(content.getUrl());
+
+      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
+
+      // simply test for the presence of a text - the ordering of the elements
+      // may differ from what was expected
+      // in the previous tests
+      // (text cannot be null here: trim() was just called on it)
+      Assert.assertTrue("no text extracted from " + sampleFile,
+          text.length() > 0);
+
+      System.out.println("Found " + sampleFile + ": " + text);
+    }
+  }
+
+  /**
+   * Loads the reference text from the sample directory. This is best effort:
+   * a failure is printed and leaves the expected text unset, because the
+   * test only prints that text.
+   */
+  public TestOOParser() {
+    try {
+      // read the test string
+      FileInputStream fis = new FileInputStream(sampleDir + fileSeparator
+          + sampleText);
+      try {
+        InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
+        StringBuilder sb = new StringBuilder();
+        char[] buf = new char[1024];
+        int len;
+        while ((len = isr.read(buf)) > 0) {
+          sb.append(buf, 0, len);
+        }
+        // normalize space
+        expectedText = sb.toString().replaceAll("[ \t\r\n]+", " ");
+      } finally {
+        // release the file handle even when reading fails part-way
+        fis.close();
+      }
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
new file mode 100644
index 0000000..9884f0c
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for PdfParser.
+ * 
+ * @author John Xing
+ */
+public class TestPdfParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-pdf/build.xml during plugin compilation.
+  // Check ./src/plugin/parse-pdf/sample/README.txt for what they are.
+  private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
+
+  private String expectedText = "A VERY SMALL PDF FILE";
+
+  /**
+   * Parses every sample PDF with the "parse-tika" extension and checks that
+   * the expected sentence occurs in the extracted text.
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    // The configuration does not depend on the sample file; create it once.
+    Configuration conf = NutchConfiguration.create();
+
+    for (String sampleFile : sampleFiles) {
+      String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
+
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika",
+          content).get(content.getUrl());
+
+      // indexOf() returns 0 for a match at the very start of the text, so
+      // "found" is ">= 0"; only -1 means the sentence is missing.
+      int index = parse.getText().indexOf(expectedText);
+      Assert.assertTrue("expected text not found in " + sampleFile,
+          index >= 0);
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
new file mode 100644
index 0000000..f15d821
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. 
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.tika;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.junit.Assert;
+import org.junit.Ignore;
+import org.junit.Test;
+
+/**
+ * Unit tests for RTF parsing. (Adapted from John Xing msword unit tests).
+ * 
+ * @author Andy Hedges
+ */
+public class TestRTFParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-rtf/build.xml during plugin compilation.
+  // Check ./src/plugin/parse-rtf/sample/README.txt for what they are.
+  private String rtfFile = "test.rtf";
+
+  /**
+   * Parses the sample RTF file with the "parse-tika" extension and compares
+   * its text, title and Dublin Core subject with the expected values.
+   * Currently disabled; the reason given refers to the text.trim()
+   * assertion below.
+   */
+  @Ignore("There seems to be an issue with line 71 e.g. text.trim()")
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    Configuration conf = NutchConfiguration.create();
+    String urlString = "file:" + sampleDir + fileSeparator + rtfFile;
+
+    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    Content content = protocol.getProtocolOutput(new Text(urlString),
+        new CrawlDatum()).getContent();
+    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika",
+        content).get(content.getUrl());
+
+    // document body
+    String text = parse.getText();
+    Assert.assertEquals("The quick brown fox jumps over the lazy dog",
+        text.trim());
+
+    // title and parse metadata
+    String title = parse.getData().getTitle();
+    Metadata meta = parse.getData().getParseMeta();
+    Assert.assertEquals("test rft document", title);
+    Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT));
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java
new file mode 100644
index 0000000..4224f93
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java
@@ -0,0 +1,156 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.tika.HTMLMetaProcessor;
+
+import java.io.ByteArrayInputStream;
+import java.net.URL;
+
+import org.xml.sax.*;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+import org.cyberneko.html.parsers.DOMFragmentParser;
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Unit tests for HTMLMetaProcessor. */
+public class TestRobotsMetaProcessor {
+
+  /*
+   * 
+   * some sample tags:
+   * 
+   * <meta name="robots" content="index,follow"> <meta name="robots"
+   * content="noindex,follow"> <meta name="robots" content="index,nofollow">
+   * <meta name="robots" content="noindex,nofollow">
+   * 
+   * <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
+   */
+
+  // HTML documents under test; entry i is checked against answers[i] and
+  // currURLsAndAnswers[i].
+  public static String[] tests = {
+      "<html><head><title>test page</title>"
+          + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
+          + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"all\"> "
+          + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
+          + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"none\"> " + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,follow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,follow\"> "
+          + "<base href=\"http://www.nutch.org/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>" + "<meta name=\"robots\"> "
+          + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+  };
+
+  // Expected { noIndex, noFollow, noCache } flags, one row per entry of
+  // tests.
+  public static final boolean[][] answers = { { true, true, true }, // NONE
+      { false, false, true }, // all
+      { true, true, true }, // nOnE
+      { true, true, false }, // none
+      { true, true, false }, // noindex,nofollow
+      { true, false, false }, // noindex,follow
+      { false, true, false }, // index,nofollow
+      { false, false, false }, // index,follow
+      { false, false, false }, // missing!
+  };
+
+  // Per test: { URL passed to the processor, expected base href (or null) }.
+  private URL[][] currURLsAndAnswers;
+
+  /**
+   * Parses each sample document into a DOM fragment, runs HTMLMetaProcessor
+   * over it and compares the resulting noindex / nofollow / nocache flags
+   * and base href with the expected values.
+   */
+  @Test
+  public void testRobotsMetaProcessor() {
+    DOMFragmentParser parser = new DOMFragmentParser();
+
+    try {
+      currURLsAndAnswers = new URL[][] {
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org/foo/"),
+              new URL("http://www.nutch.org/") },
+          { new URL("http://www.nutch.org"),
+              new URL("http://www.nutch.org/base/") } };
+    } catch (Exception e) {
+      Assert.fail("couldn't make test URLs!");
+    }
+
+    for (int i = 0; i < tests.length; i++) {
+      byte[] bytes = tests[i].getBytes();
+
+      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+
+      try {
+        parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
+      } catch (Exception e) {
+        // A document that failed to parse would otherwise surface as a
+        // misleading "got ... wrong" failure below; report the real cause.
+        e.printStackTrace();
+        Assert.fail("couldn't parse test " + i + ": " + e);
+      }
+
+      HTMLMetaTags robotsMeta = new HTMLMetaTags();
+      HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);
+
+      Assert.assertEquals("got index wrong on test " + i, answers[i][0],
+          robotsMeta.getNoIndex());
+      Assert.assertEquals("got follow wrong on test " + i, answers[i][1],
+          robotsMeta.getNoFollow());
+      Assert.assertEquals("got cache wrong on test " + i, answers[i][2],
+          robotsMeta.getNoCache());
+      // assertEquals treats two nulls as equal and otherwise uses equals(),
+      // which is the comparison the expected base href needs.
+      Assert.assertEquals("got base href wrong on test " + i + " (got "
+          + robotsMeta.getBaseHref() + ")", currURLsAndAnswers[i][1],
+          robotsMeta.getBaseHref());
+
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-zip/build.xml
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/parse-zip/build.xml b/nutch-plugins/parse-zip/build.xml new file mode 100644 index 0000000..991ce31 --- /dev/null +++ b/nutch-plugins/parse-zip/build.xml @@ -0,0 +1,38 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="parse-zip" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> + <ant target="deploy" inheritall="false" dir="../protocol-file"/> + <!-- <ant target="deploy" inheritall="false" dir="../parse-text"/>--> + </target> + + + <!-- for junit test --> + <mkdir dir="${build.test}/data" /> + <copy todir="${build.test}/data"> + <fileset dir="sample"> + <include name="*.zip" /> + </fileset> + </copy> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-zip/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-zip/ivy.xml b/nutch-plugins/parse-zip/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/parse-zip/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software 
Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-zip/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-zip/plugin.xml b/nutch-plugins/parse-zip/plugin.xml new file mode 100644 index 0000000..35ec0eb --- /dev/null +++ b/nutch-plugins/parse-zip/plugin.xml @@ -0,0 +1,46 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="parse-zip" + name="Zip Parse Plug-in" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="parse-zip.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.parse.zip" + name="ZipParser" + point="org.apache.nutch.parse.Parser"> + + <implementation id="org.apache.nutch.parse.zip.ZipParser" + class="org.apache.nutch.parse.zip.ZipParser"> + <parameter name="contentType" value="application/zip"/> + <parameter name="pathSuffix" value="zip"/> + </implementation> + + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-zip/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-zip/pom.xml b/nutch-plugins/parse-zip/pom.xml new file mode 100644 index 0000000..b30b9a1 --- /dev/null +++ b/nutch-plugins/parse-zip/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>parse-zip</artifactId> + <packaging>jar</packaging> + + <name>parse-zip</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-zip/sample/test.zip ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-zip/sample/test.zip b/nutch-plugins/parse-zip/sample/test.zip new file mode 100644 index 0000000..0c649d2 Binary files /dev/null and b/nutch-plugins/parse-zip/sample/test.zip differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipParser.java b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipParser.java new file mode 100644 index 0000000..f441fd0 --- /dev/null +++ 
b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipParser.java @@ -0,0 +1,144 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.zip; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.hadoop.conf.Configuration; + +/** + * ZipParser class based on MSPowerPointParser class by Stephan Strittmatter. 
+ * Nutch parse plugin for zip files - Content Type : application/zip + */
public class ZipParser implements Parser {

  private static final Logger LOG = LoggerFactory.getLogger(ZipParser.class);
  private Configuration conf;

  /** Creates a new instance of ZipParser */
  public ZipParser() {
  }

  /**
   * Parses zip content by handing its bytes to {@link ZipTextExtractor} and
   * wrapping the extracted text and outlinks in a {@link ParseResult}.
   *
   * @param content
   *          the fetched zip document
   * @return a successful parse with an empty title, or an empty parse result
   *         with status FAILED when the content length does not match the
   *         Content-Length metadata or when extraction throws
   */
  public ParseResult getParse(final Content content) {

    String resultText = null;
    // No title is ever extracted from a zip archive.
    final String resultTitle = "";
    final List<Outlink> outLinksList = new ArrayList<Outlink>();

    try {
      final String contentLen = content.getMetadata().get(
          Response.CONTENT_LENGTH);
      final byte[] contentInBytes = content.getContent();

      // Content-Length is optional. Parsing it before the null check turned
      // every response without that header into a parse failure, so it is
      // parsed and compared only when present.
      if (contentLen != null) {
        final int len = Integer.parseInt(contentLen);
        if (LOG.isDebugEnabled()) {
          LOG.debug("ziplen: " + len);
        }
        if (contentInBytes.length != len) {
          return new ParseStatus(ParseStatus.FAILED,
              ParseStatus.FAILED_TRUNCATED, "Content truncated at "
                  + contentInBytes.length
                  + " bytes. Parser can't handle incomplete zip file.")
              .getEmptyParseResult(content.getUrl(), getConf());
        }
      }

      ZipTextExtractor extractor = new ZipTextExtractor(getConf());

      // extract text
      resultText = extractor.extractText(new ByteArrayInputStream(
          contentInBytes), content.getUrl(), outLinksList);

    } catch (Exception e) {
      // Deliberately broad: any failure (including a non-numeric
      // Content-Length) is reported as a failed parse, not propagated.
      return new ParseStatus(ParseStatus.FAILED,
          "Can't be handled as Zip document. " + e).getEmptyParseResult(
          content.getUrl(), getConf());
    }

    if (resultText == null) {
      resultText = "";
    }

    final Outlink[] outlinks = outLinksList.toArray(new Outlink[0]);
    final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
        resultTitle, outlinks, content.getMetadata());

    if (LOG.isTraceEnabled()) {
      LOG.trace("Zip file parsed sucessfully !!");
    }
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(
        resultText, parseData));
  }

  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  public Configuration getConf() {
    return this.conf;
  }

  /**
   * Command-line entry point: parses the zip file given as first argument and
   * prints the number of parses, the parse text and the parse data.
   *
   * @param args
   *          {@code args[0]} is the path of the zip file
   * @throws IOException
   *           if the file cannot be read completely
   */
  public static void main(String[] args) throws IOException {
    if (args.length < 1) {
      System.out.println("ZipParser <zip_file>");
      System.exit(1);
    }
    File file = new File(args[0]);
    String url = "file:" + file.getCanonicalPath();
    byte[] bytes = readFully(file);
    Configuration conf = NutchConfiguration.create();
    ZipParser parser = new ZipParser();
    parser.setConf(conf);
    Metadata meta = new Metadata();
    meta.add(Response.CONTENT_LENGTH, "" + file.length());
    ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
        "application/zip", meta, conf));
    Parse p = parseResult.get(url);
    System.out.println(parseResult.size());
    System.out.println("Parse Text:");
    System.out.println(p.getText());
    System.out.println("Parse Data:");
    System.out.println(p.getData());
  }

  /**
   * Reads a whole file into memory. A single read() is not guaranteed to fill
   * the buffer, and available() is only an estimate, so this loops until
   * file.length() bytes have arrived and always closes the stream.
   */
  private static byte[] readFully(File file) throws IOException {
    byte[] bytes = new byte[(int) file.length()];
    FileInputStream in = new FileInputStream(file);
    try {
      int offset = 0;
      while (offset < bytes.length) {
        int n = in.read(bytes, offset, bytes.length - offset);
        if (n < 0) {
          throw new IOException("Unexpected end of file after " + offset
              + " of " + bytes.length + " bytes: " + file);
        }
        offset += n;
      }
    } finally {
      in.close();
    }
    return bytes;
  }
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipTextExtractor.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipTextExtractor.java b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipTextExtractor.java new file mode
100644 index 0000000..b454727 --- /dev/null +++ b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipTextExtractor.java @@ -0,0 +1,120 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.zip; + +// JDK imports +import java.io.IOException; +import java.io.InputStream; +import java.util.List; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; +import java.net.URL; + +// Commons Logging imports +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; + +// Nutch imports +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.protocol.Content; +import org.apache.tika.Tika; +
/**
 * Walks the entries of a zip stream, parses every file entry whose name has an
 * extension with {@link ParseUtil}, and collects the text and outlinks.
 *
 * @author Rohit Kulkarni & Ashish Vaidya
 */
public class ZipTextExtractor {

  public static final Logger LOG = LoggerFactory
      .getLogger(ZipTextExtractor.class);

  private Configuration conf;

  /** Creates a new instance of ZipTextExtractor */
  public ZipTextExtractor(Configuration conf) {
    this.conf = conf;
  }

  /**
   * Extracts the text of all parseable entries of a zip archive.
   *
   * @param input
   *          stream positioned at the start of the zip data; it is wrapped in
   *          a {@link ZipInputStream} and is not closed here
   * @param url
   *          URL of the archive; each entry is addressed as
   *          {@code url + "/" + entryName}
   * @param outLinksList
   *          receives a copy (target URL and anchor) of every outlink found
   *          in the parsed entries
   * @return for every successfully parsed entry, the entry name, a space, the
   *         parse text and a space, concatenated in archive order
   * @throws IOException
   *           if the zip stream cannot be read or an entry URL is malformed
   */
  public String extractText(InputStream input, String url,
      List<Outlink> outLinksList) throws IOException {
    StringBuilder resultText = new StringBuilder();
    ZipInputStream zin = new ZipInputStream(input);
    // Created on first use and shared by all entries of this archive.
    Tika tika = null;
    ZipEntry entry;

    while ((entry = zin.getNextEntry()) != null) {
      if (entry.isDirectory()) {
        continue;
      }

      // The entry is consumed before its name is inspected, as before.
      byte[] b = readEntry(zin);

      String fname = entry.getName();
      String newurl = url + "/" + fname;
      String base = new URL(newurl).toString();

      // Only entries whose name carries an extension are parsed.
      if (fname.lastIndexOf('.') == -1) {
        continue;
      }

      // Trying to resolve the Mime-Type
      if (tika == null) {
        tika = new Tika();
      }
      String contentType = tika.detect(fname);
      try {
        Metadata metadata = new Metadata();
        // Use the number of bytes actually read; the declared entry size may
        // be -1 (unknown).
        metadata.set(Response.CONTENT_LENGTH, Integer.toString(b.length));
        metadata.set(Response.CONTENT_TYPE, contentType);
        Content content = new Content(newurl, base, b, contentType, metadata,
            this.conf);
        Parse parse = new ParseUtil(this.conf).parse(content).get(
            content.getUrl());
        ParseData theParseData = parse.getData();

        for (Outlink outlink : theParseData.getOutlinks()) {
          outLinksList.add(new Outlink(outlink.getToUrl(), outlink
              .getAnchor()));
        }

        resultText.append(fname).append(' ').append(parse.getText())
            .append(' ');
      } catch (ParseException e) {
        // Best effort: an entry that cannot be parsed is skipped.
        if (LOG.isInfoEnabled()) {
          LOG.info("fetch okay, but can't parse " + fname + ", reason: "
              + e.getMessage());
        }
      }
    }

    return resultText.toString();
  }

  /**
   * Reads the current entry up to its end. This does not rely on
   * {@link ZipEntry#getSize()}, which returns -1 when the size is not known
   * and previously caused a NegativeArraySizeException.
   *
   * NOTE(review): the whole entry is buffered in memory without an upper
   * bound; consider a size limit if archives come from untrusted sources.
   */
  private static byte[] readEntry(ZipInputStream zin) throws IOException {
    java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream();
    byte[] buffer = new byte[8192];
    int n;
    while ((n = zin.read(buffer, 0, buffer.length)) != -1) {
      out.write(buffer, 0, n);
    }
    return out.toByteArray();
  }

}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/package-info.java ---------------------------------------------------------------------- diff --git
a/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/package-info.java b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/package-info.java new file mode 100644 index 0000000..fc81ee1 --- /dev/null +++ b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Parse ZIP files: embedded files are recursively passed to appropriate parsers. + */ +package org.apache.nutch.parse.zip; + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java b/nutch-plugins/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java new file mode 100644 index 0000000..17e386a --- /dev/null +++ b/nutch-plugins/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java @@ -0,0 +1,71 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.zip; + +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParseException; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.junit.Assert; +import org.junit.Test; +
/**
 * Based on Unit tests for MSWordParser by John Xing
 *
 * @author Rohit Kulkarni & Ashish Vaidya
 */
public class TestZipParser {

  private String fileSeparator = System.getProperty("file.separator");
  // This system property is defined in ./src/plugin/build-plugin.xml
  private String sampleDir = System.getProperty("test.data", ".");

  // Make sure sample files are copied to "test.data"

  private String[] sampleFiles = { "test.zip" };

  private String expectedText = "textfile.txt This is text file number 1 ";

  /**
   * Fetches each sample archive through the file: protocol, parses it with
   * the parse-zip plugin and compares the extracted text with the expected
   * text.
   */
  @Test
  public void testIt() throws ProtocolException, ParseException {
    Configuration conf = NutchConfiguration.create();

    for (String sampleFile : sampleFiles) {
      String urlString = "file:" + sampleDir + fileSeparator + sampleFile;

      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
      Content content = protocol.getProtocolOutput(new Text(urlString),
          new CrawlDatum()).getContent();
      Parse parse = new ParseUtil(conf).parseByExtensionId("parse-zip",
          content).get(content.getUrl());

      // assertEquals reports expected and actual text on failure, which
      // assertTrue(a.equals(b)) did not.
      Assert.assertEquals(expectedText, parse.getText());
    }
  }

}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-naivebayes/build-ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-naivebayes/build-ivy.xml b/nutch-plugins/parsefilter-naivebayes/build-ivy.xml new file mode 100644 index 0000000..22bee5f --- /dev/null +++ b/nutch-plugins/parsefilter-naivebayes/build-ivy.xml @@ -0,0 +1,54 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
+--> +<project name="parsefilter-naivebayes" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant"> + + <property name="ivy.install.version" value="2.1.0" /> + <condition property="ivy.home" value="${env.IVY_HOME}"> + <isset property="env.IVY_HOME" /> + </condition> + <property name="ivy.home" value="${user.home}/.ant" /> + <property name="ivy.checksums" value="" /> + <property name="ivy.jar.dir" value="${ivy.home}/lib" /> + <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" /> + + <target name="download-ivy" unless="offline"> + + <mkdir dir="${ivy.jar.dir}"/> + <!-- download Ivy from web site so that it can be used even without any special installation --> + <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" + dest="${ivy.jar.file}" usetimestamp="true"/> + </target> + + <target name="init-ivy" depends="download-ivy"> + <!-- try to load ivy here from ivy home, in case the user has not already dropped + it into ant's lib dir (note that the latter copy will always take precedence). + We will not fail as long as local lib dir exists (it may be empty) and + ivy is in at least one of ant's lib dir or the local lib dir. 
--> + <path id="ivy.lib.path"> + <fileset dir="${ivy.jar.dir}" includes="*.jar"/> + + </path> + <taskdef resource="org/apache/ivy/ant/antlib.xml" + uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/> + </target> + + <target name="deps-jar" depends="init-ivy"> + <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/> + </target> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-naivebayes/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-naivebayes/build.xml b/nutch-plugins/parsefilter-naivebayes/build.xml new file mode 100644 index 0000000..6fb7a9d --- /dev/null +++ b/nutch-plugins/parsefilter-naivebayes/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="parsefilter-naivebayes" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-naivebayes/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-naivebayes/ivy.xml b/nutch-plugins/parsefilter-naivebayes/ivy.xml new file mode 100644 index 0000000..08cca2c --- /dev/null +++ b/nutch-plugins/parsefilter-naivebayes/ivy.xml @@ -0,0 +1,49 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + + <dependency org="org.apache.mahout" name="mahout-math" rev="0.10.1" /> + <dependency org="org.apache.mahout" name="mahout-core" rev="0.9" > + <exclude org="org.apache.mrunit" name="mrunit"/> + </dependency> + <dependency org="org.apache.lucene" name="lucene-core" rev="5.5.0" /> + <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="5.5.0" /> + + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-naivebayes/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-naivebayes/plugin.xml b/nutch-plugins/parsefilter-naivebayes/plugin.xml new file mode 100644 index 0000000..ac15041 --- /dev/null +++ b/nutch-plugins/parsefilter-naivebayes/plugin.xml @@ -0,0 +1,56 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="parsefilter-naivebayes" + name="Naive Bayes Parse Filter" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="parsefilter-naivebayes.jar"> + <export name="*"/> + </library> + <library name="commons-cli-2.0-mahout.jar"/> + <library name="commons-lang3-3.1.jar"/> + <library name="commons-math3-3.2.jar"/> + <library name="guava-14.0.1.jar"/> + <library name="jackson-core-asl-1.9.12.jar"/> + <library name="jackson-mapper-asl-1.9.12.jar"/> + <library name="lucene-analyzers-common-5.5.0.jar"/> + <library name="lucene-core-5.5.0.jar"/> + <library name="mahout-core-0.9.jar"/> + <library name="mahout-math-0.10.1.jar"/> + <library name="slf4j-api-1.7.12.jar"/> + <library name="solr-commons-csv-3.5.0.jar"/> + <library name="t-digest-3.1.jar"/> + <library name="xmlpull-1.1.3.1.jar"/> + <library name="xpp3_min-1.1.4c.jar"/> + <library name="xstream-1.4.4.jar"/> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.htmlparsefilter.naivebayes" + name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter"> + <implementation id="NaiveBayesHTMLParseFilter" + class="org.apache.nutch.parsefilter.naivebayes.NaiveBayesParseFilter"/> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-naivebayes/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-naivebayes/pom.xml b/nutch-plugins/parsefilter-naivebayes/pom.xml new file mode 100644 index 
0000000..0a99e47 --- /dev/null +++ b/nutch-plugins/parsefilter-naivebayes/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>parsefilter-naivebayes</artifactId> + <packaging>jar</packaging> + + <name>parsefilter-naivebayes</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Classify.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Classify.java 
b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Classify.java new file mode 100644 index 0000000..d755ff6 --- /dev/null +++ b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Classify.java @@ -0,0 +1,120 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.parsefilter.naivebayes; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.io.InputStreamReader; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +
/**
 * Two-class naive Bayes text classifier ("0" = irrelevant, "1" = relevant)
 * backed by the model file "naivebayes-model". The model is loaded on the
 * first call of {@link #classify(String)} and kept in static fields.
 */
public class Classify {

  private static int uniquewords_size = 0;

  private static int numof_ir = 0;
  private static int numwords_ir = 0;
  private static HashMap<String, Integer> wordfreq_ir = null;

  private static int numof_r = 0;
  private static int numwords_r = 0;
  private static HashMap<String, Integer> wordfreq_r = null;
  private static boolean ismodel = false;

  /**
   * Parses a flattened dictionary of the form {@code word:count,word:count}.
   *
   * @param line
   *          comma separated {@code key:value} fields
   * @return map from key to integer value; empty fields are skipped
   */
  public static HashMap<String, Integer> unflattenToHashmap(String line) {
    HashMap<String, Integer> dict = new HashMap<String, Integer>();

    for (String field : line.split(",")) {
      // An empty line produces one empty field; skip it instead of failing
      // with an ArrayIndexOutOfBoundsException.
      if (field.isEmpty()) {
        continue;
      }
      String[] keyValue = field.split(":");
      dict.put(keyValue[0], Integer.valueOf(keyValue[1]));
    }

    return dict;
  }

  /**
   * Classifies a text as relevant or irrelevant.
   *
   * @param line
   *          text to classify; everything except ASCII letters and spaces is
   *          removed and the rest is lower-cased and split on single spaces
   * @return "0" if the irrelevant class scores strictly higher, else "1"
   * @throws IOException
   *           if the model file cannot be read
   */
  public static String classify(String line) throws IOException {

    String[] linearray = line.replaceAll("[^a-zA-Z ]", "").toLowerCase()
        .split(" ");

    if (!ismodel) {
      loadModel();
    }

    // Denominators of the add-one smoothed word likelihoods; constant per
    // class, so computed once.
    double logTotal_ir = Math.log(numwords_ir + uniquewords_size);
    double logTotal_r = Math.log(numwords_r + uniquewords_size);

    double prob_ir = 0;
    double prob_r = 0;

    // Add-one (Laplace) smoothing: log((count + 1) / (numwords + uniquewords)).
    // The "+ 1" belongs inside the logarithm; with it outside, a word seen
    // once in a class scored the same as a word never seen in that class.
    for (String word : linearray) {
      prob_ir += Math.log(1.0 + count(wordfreq_ir, word)) - logTotal_ir;
      prob_r += Math.log(1.0 + count(wordfreq_r, word)) - logTotal_r;
    }

    // class priors
    prob_ir += Math.log(numof_ir) - Math.log(numof_ir + numof_r);
    prob_r += Math.log(numof_r) - Math.log(numof_ir + numof_r);

    return prob_ir > prob_r ? "0" : "1";
  }

  /** Returns the stored frequency of a word, or 0 if it is not in the map. */
  private static int count(HashMap<String, Integer> wordfreq, String word) {
    Integer freq = wordfreq.get(word);
    return freq == null ? 0 : freq.intValue();
  }

  /**
   * Reads "naivebayes-model" from the default Hadoop file system. Lines are
   * read in this order: unique word count, (skipped), irrelevant example
   * count, irrelevant word count, irrelevant word frequencies, (skipped),
   * relevant example count, relevant word count, relevant word frequencies.
   * The reader is closed even when reading or parsing fails.
   */
  private static void loadModel() throws IOException {
    Configuration configuration = new Configuration();
    FileSystem fs = FileSystem.get(configuration);

    BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(
        fs.open(new Path("naivebayes-model"))));
    try {
      uniquewords_size = Integer.valueOf(bufferedReader.readLine());
      bufferedReader.readLine();

      numof_ir = Integer.valueOf(bufferedReader.readLine());
      numwords_ir = Integer.valueOf(bufferedReader.readLine());
      wordfreq_ir = unflattenToHashmap(bufferedReader.readLine());
      bufferedReader.readLine();
      numof_r = Integer.valueOf(bufferedReader.readLine());
      numwords_r = Integer.valueOf(bufferedReader.readLine());
      wordfreq_r = unflattenToHashmap(bufferedReader.readLine());

      ismodel = true;
    } finally {
      bufferedReader.close();
    }
  }

}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java new file mode 100644 index 0000000..30810ae --- /dev/null +++ b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java @@ -0,0 +1,197 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.parsefilter.naivebayes; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.DocumentFragment; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.util.StringUtils; +import org.apache.nutch.parse.HTMLMetaTags; +import org.apache.nutch.parse.HtmlParseFilter; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.protocol.Content; + +import java.io.Reader; +import java.io.BufferedReader; +import java.io.IOException; +import java.util.ArrayList; + +/** + * Html Parse filter that classifies the outlinks from the parseresult as + * relevant or irrelevant based on the parseText's relevancy (using a training + * file where you can give positive and negative example texts see the + * description of parsefilter.naivebayes.trainfile) and if found irrelevant it + * gives the link a second chance if it contains any of the words from the list + * given in parsefilter.naivebayes.wordlist. CAUTION: Set the parser.timeout to + * -1 or a bigger value than 30, when using this classifier. 
+ */
public class NaiveBayesParseFilter implements HtmlParseFilter {

  private static final Logger LOG = LoggerFactory
      .getLogger(NaiveBayesParseFilter.class);

  public static final String TRAINFILE_MODELFILTER = "parsefilter.naivebayes.trainfile";
  public static final String DICTFILE_MODELFILTER = "parsefilter.naivebayes.wordlist";

  private Configuration conf;
  private String inputFilePath;
  private String dictionaryFile;
  private ArrayList<String> wordlist = new ArrayList<String>();

  /**
   * Classifies a page text.
   *
   * @return true if the text is classified as relevant; false if it is
   *         classified as irrelevant or if classification fails with an
   *         IOException (which is logged)
   */
  public boolean filterParse(String text) {

    try {
      return classify(text);
    } catch (IOException e) {
      LOG.error("Error occured while classifying:: " + text + " ::"
          + StringUtils.stringifyException(e));
    }

    return false;
  }

  /** Returns true if the URL contains any word of the configured word list. */
  public boolean filterUrl(String url) {
    return containsWord(url, wordlist);
  }

  /**
   * @return true if {@link Classify#classify(String)} labels the text "1"
   *         (relevant)
   * @throws IOException
   *           if the model cannot be read
   */
  public boolean classify(String text) throws IOException {
    return Classify.classify(text).equals("1");
  }

  /**
   * Trains the model from the configured training file unless a file named
   * "naivebayes-model" already exists on the configured file system.
   */
  public void train() throws Exception {
    // check if the model file exists, if it does then don't train
    if (!FileSystem.get(conf).exists(new Path("naivebayes-model"))) {
      LOG.info("Training the Naive Bayes Model");
      Train.start(inputFilePath);
    } else {
      LOG.info("Model file already exists. Skipping training.");
    }
  }

  /** Returns true if {@code url} contains at least one entry of {@code wordlist}. */
  public boolean containsWord(String url, ArrayList<String> wordlist) {
    for (String word : wordlist) {
      if (url.contains(word)) {
        return true;
      }
    }

    return false;
  }

  /**
   * Reads the training file and word list locations from the configuration,
   * loads the word list and triggers training.
   *
   * @throws IllegalArgumentException
   *           if either property is unset or blank, if the file system check
   *           below fires, or if the word list resource cannot be found
   */
  public void setConf(Configuration conf) {
    this.conf = conf;
    inputFilePath = conf.get(TRAINFILE_MODELFILTER);
    dictionaryFile = conf.get(DICTFILE_MODELFILTER);
    if (isBlank(inputFilePath) || isBlank(dictionaryFile)) {
      // Built from the constants so the message names the real properties
      // (it used to say "parsefilte.naivebayes...").
      String message = "ParseFilter: NaiveBayes: trainfile or wordlist not set in the "
          + TRAINFILE_MODELFILTER + " or " + DICTFILE_MODELFILTER;
      if (LOG.isErrorEnabled()) {
        LOG.error(message);
      }
      throw new IllegalArgumentException(message);
    }
    try {
      // NOTE(review): this condition is true when either path EXISTS on the
      // file system, yet the message says "not found". The word list is
      // loaded as a configuration resource below, so the intended check is
      // unclear; left unchanged, confirm before negating or removing it.
      if ((FileSystem.get(conf).exists(new Path(inputFilePath)))
          || (FileSystem.get(conf).exists(new Path(dictionaryFile)))) {
        String message = "ParseFilter: NaiveBayes: " + inputFilePath + " or "
            + dictionaryFile + " not found!";
        if (LOG.isErrorEnabled()) {
          LOG.error(message);
        }
        throw new IllegalArgumentException(message);
      }

      loadWordlist();

    } catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
    }
    try {
      train();
    } catch (Exception e) {

      LOG.error("Error occured while training:: "
          + StringUtils.stringifyException(e));

    }

  }

  /** Returns true for a null, empty or whitespace-only string. */
  private static boolean isBlank(String value) {
    return value == null || value.trim().length() == 0;
  }

  /**
   * Replaces the word list with the lines of the configured word list
   * resource. Blank lines are skipped, because an empty word is contained in
   * every URL and would make {@link #filterUrl(String)} accept everything.
   * The reader is always closed.
   */
  private void loadWordlist() throws IOException {
    Reader reader = conf.getConfResourceAsReader(dictionaryFile);
    if (reader == null) {
      // Previously this surfaced as a NullPointerException.
      String message = "ParseFilter: NaiveBayes: " + dictionaryFile
          + " not found!";
      if (LOG.isErrorEnabled()) {
        LOG.error(message);
      }
      throw new IllegalArgumentException(message);
    }

    // Start from scratch so a repeated setConf() neither duplicates entries
    // nor keeps words from an earlier configuration.
    wordlist.clear();

    BufferedReader br = new BufferedReader(reader);
    try {
      String currentLine;
      while ((currentLine = br.readLine()) != null) {
        if (currentLine.trim().length() > 0) {
          wordlist.add(currentLine);
        }
      }
    } finally {
      br.close();
    }
  }

  public Configuration getConf() {
    return this.conf;
  }

  /**
   * If the page text is classified as irrelevant, keeps only those outlinks
   * whose target URL contains a word from the word list; outlinks of relevant
   * pages are left untouched.
   *
   * @return the given {@code parseResult}; its parse data is modified in place
   */
  @Override
  public ParseResult filter(Content content, ParseResult parseResult,
      HTMLMetaTags metaTags, DocumentFragment doc) {

    Parse parse = parseResult.get(content.getUrl());

    String url = content.getBaseUrl();
    String text = parse.getText();

    if (filterParse(text)) {
      LOG.info("ParseFilter: NaiveBayes: Page found relevant:: " + url);
      return parseResult;
    }

    // kick in the second tier if parent page found irrelevant
    LOG.info("ParseFilter: NaiveBayes: Page found irrelevant:: " + url);
    LOG.info("Checking outlinks");

    ArrayList<Outlink> tempOutlinks = new ArrayList<Outlink>();
    for (Outlink outlink : parse.getData().getOutlinks()) {
      LOG.info("ParseFilter: NaiveBayes: Outlink to check:: "
          + outlink.getToUrl());
      if (filterUrl(outlink.getToUrl())) {
        tempOutlinks.add(outlink);
        LOG.info("ParseFilter: NaiveBayes: found relevant");
      } else {
        LOG.info("ParseFilter: NaiveBayes: found irrelevant");
      }
    }
    parse.getData().setOutlinks(
        tempOutlinks.toArray(new Outlink[tempOutlinks.size()]));

    return parseResult;
  }

}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Train.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Train.java b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Train.java new file mode 100644 index 0000000..19a6911 --- /dev/null +++ b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Train.java @@ -0,0 +1,148 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parsefilter.naivebayes; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.util.HashMap; +import java.util.HashSet; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +public class Train { + + public static String replacefirstoccuranceof(String tomatch, String line) { + + int index = line.indexOf(tomatch); + if (index == -1) { + return line; + } else { + return line.substring(0, index) + + line.substring(index + tomatch.length()); + } + + } + + public static void updateHashMap(HashMap<String, Integer> dict, String key) { + if (!key.equals("")) { + if (dict.containsKey(key)) + dict.put(key, dict.get(key) + 1); + else + dict.put(key, 1); + } + } + + public static String flattenHashMap(HashMap<String, Integer> dict) { + String result = ""; + + for (String key : dict.keySet()) { + + result += key + ":" + dict.get(key) + ","; + } + + // remove the last comma + result = result.substring(0, result.length() - 1); + + return result; + } + + public static void start(String filepath) throws IOException { + + // two classes 0/irrelevant and 1/relevant + + // calculate the total number of instances/examples per class, word count in + // each class and for each class a word:frequency map + + int numof_ir = 0; + int numof_r = 0; + int numwords_ir = 0; + int numwords_r = 0; + HashSet<String> uniquewords = new HashSet<String>(); + HashMap<String, 
Integer> wordfreq_ir = new HashMap<String, Integer>(); + HashMap<String, Integer> wordfreq_r = new HashMap<String, Integer>(); + + String line = ""; + String target = ""; + String[] linearray = null; + + // read the line + Configuration configuration = new Configuration(); + FileSystem fs = FileSystem.get(configuration); + + BufferedReader bufferedReader = new BufferedReader( + configuration.getConfResourceAsReader(filepath)); + + while ((line = bufferedReader.readLine()) != null) { + + target = line.split("\t")[0]; + + line = replacefirstoccuranceof(target + "\t", line); + + linearray = line.replaceAll("[^a-zA-Z ]", "").toLowerCase().split(" "); + + // update the data structures + if (target.equals("0")) { + + numof_ir += 1; + numwords_ir += linearray.length; + for (int i = 0; i < linearray.length; i++) { + uniquewords.add(linearray[i]); + updateHashMap(wordfreq_ir, linearray[i]); + } + } else { + + numof_r += 1; + numwords_r += linearray.length; + for (int i = 0; i < linearray.length; i++) { + uniquewords.add(linearray[i]); + updateHashMap(wordfreq_r, linearray[i]); + } + + } + + } + + // write the model file + + Path path = new Path("naivebayes-model"); + + Writer writer = new BufferedWriter(new OutputStreamWriter(fs.create(path, + true))); + + writer.write(String.valueOf(uniquewords.size()) + "\n"); + writer.write("0\n"); + writer.write(String.valueOf(numof_ir) + "\n"); + writer.write(String.valueOf(numwords_ir) + "\n"); + writer.write(flattenHashMap(wordfreq_ir) + "\n"); + writer.write("1\n"); + writer.write(String.valueOf(numof_r) + "\n"); + writer.write(String.valueOf(numwords_r) + "\n"); + writer.write(flattenHashMap(wordfreq_r) + "\n"); + + writer.close(); + + bufferedReader.close(); + + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/package-info.java ---------------------------------------------------------------------- diff --git 
a/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/package-info.java b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/package-info.java new file mode 100644 index 0000000..6a892be --- /dev/null +++ b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/package-info.java @@ -0,0 +1,28 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Html Parse filter that classifies the outlinks from the parseresult as + * relevant or irrelevant based on the parseText's relevancy (using a training + * file where you can give positive and negative example texts; see the + * description of parsefilter.naivebayes.trainfile) and if found irrelevant + * it gives the link a second chance if it contains any of the words from the + * list given in parsefilter.naivebayes.wordlist. CAUTION: Set the + * parser.timeout to -1 or a bigger value than 30, when using this classifier. 
+ */ +package org.apache.nutch.parsefilter.naivebayes; + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-regex/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-regex/build.xml b/nutch-plugins/parsefilter-regex/build.xml new file mode 100644 index 0000000..14d1127 --- /dev/null +++ b/nutch-plugins/parsefilter-regex/build.xml @@ -0,0 +1,27 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="parsefilter-regex" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="data" /> + </copy> +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-regex/data/regex-parsefilter.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-regex/data/regex-parsefilter.txt b/nutch-plugins/parsefilter-regex/data/regex-parsefilter.txt new file mode 100644 index 0000000..9d15cd8 --- /dev/null +++ b/nutch-plugins/parsefilter-regex/data/regex-parsefilter.txt @@ -0,0 +1,10 @@ +# Example configuration file for parsefilter-regex +# +# Parse metadata field <name> is set to true if the HTML matches the regex. The +# source can either be html or text. If source is html, the regex is applied to +# the entire HTML tree. If source is text, the regex is applied to the +# extracted text. +# +# format: <name>\t<source>\t<regex>\n +first html h1 +second text blablabla http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-regex/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-regex/ivy.xml b/nutch-plugins/parsefilter-regex/ivy.xml new file mode 100644 index 0000000..ed4cbc3 --- /dev/null +++ b/nutch-plugins/parsefilter-regex/ivy.xml @@ -0,0 +1,37 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-regex/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-regex/plugin.xml b/nutch-plugins/parsefilter-regex/plugin.xml new file mode 100644 index 0000000..0725492 --- /dev/null +++ b/nutch-plugins/parsefilter-regex/plugin.xml @@ -0,0 +1,42 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="parsefilter-regex" + name="Regex Parse Filter" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="parsefilter-regex.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.htmlparsefilter.regex" + name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter"> + <implementation id="RegexParseFilter" + class="org.apache.nutch.parsefilter.regex.RegexParseFilter"> + <parameter name="file" value="regex-parsefilter.txt"/> + </implementation> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-regex/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-regex/pom.xml b/nutch-plugins/parsefilter-regex/pom.xml new file mode 100644 index 0000000..19b6452 --- /dev/null +++ b/nutch-plugins/parsefilter-regex/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>parsefilter-regex</artifactId> + <packaging>jar</packaging> + + <name>parsefilter-regex</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project>
