http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java deleted file mode 100644 index 576b3df..0000000 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java +++ /dev/null @@ -1,92 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.nutch.tika; - -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.ParseException; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import java.io.File; - -/** - * Unit tests for MSWordParser. - * - * @author John Xing - */ -public class TestMSWordParser { - - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - // Make sure sample files are copied to "test.data" as specified in - // ./src/plugin/parse-msword/build.xml during plugin compilation. - // Check ./src/plugin/parse-msword/sample/README.txt for what they are. 
- private String[] sampleFiles = { "word97.doc" }; - - private String expectedText = "This is a sample doc file prepared for nutch."; - - private Configuration conf; - - @Before - public void setUp() { - conf = NutchConfiguration.create(); - conf.set("file.content.limit", "-1"); - } - - public String getTextContent(String fileName) throws ProtocolException, - ParseException { - String urlString = "file:" + sampleDir + fileSeparator + fileName; - Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); - Content content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) - .get(content.getUrl()); - return parse.getText(); - } - - @Test - public void testIt() throws ProtocolException, ParseException { - for (int i = 0; i < sampleFiles.length; i++) { - String found = getTextContent(sampleFiles[i]); - Assert.assertTrue("text found : '" + found + "'", - found.startsWith(expectedText)); - } - } - - @Test - public void testOpeningDocs() throws ProtocolException, ParseException { - String[] filenames = new File(sampleDir).list(); - for (int i = 0; i < filenames.length; i++) { - if (filenames[i].endsWith(".doc") == false) - continue; - Assert.assertTrue("cann't read content of " + filenames[i], - getTextContent(filenames[i]).length() > 0); - } - } -}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java deleted file mode 100644 index 6960bad..0000000 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java +++ /dev/null @@ -1,107 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.tika; - -import java.io.FileInputStream; -import java.io.InputStreamReader; - -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.protocol.*; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseException; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -/** - * Unit tests for OOParser. 
- * - * @author Andrzej Bialecki - */ -public class TestOOParser { - - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - // Make sure sample files are copied to "test.data" as specified in - // ./src/plugin/parse-oo/build.xml during plugin compilation. - private String[] sampleFiles = { "ootest.odt", "ootest.sxw" }; - - private String expectedText; - - private String sampleText = "ootest.txt"; - - @Test - public void testIt() throws ProtocolException, ParseException { - String urlString; - Content content; - Parse parse; - Configuration conf = NutchConfiguration.create(); - Protocol protocol; - ProtocolFactory factory = new ProtocolFactory(conf); - - System.out.println("Expected : " + expectedText); - - for (int i = 0; i < sampleFiles.length; i++) { - urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; - - if (sampleFiles[i].startsWith("ootest") == false) - continue; - - protocol = factory.getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) - .get(content.getUrl()); - - String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim(); - - // simply test for the presence of a text - the ordering of the elements - // may differ from what was expected - // in the previous tests - Assert.assertTrue(text != null && text.length() > 0); - - System.out.println("Found " + sampleFiles[i] + ": " + text); - } - } - - public TestOOParser() { - try { - // read the test string - FileInputStream fis = new FileInputStream(sampleDir + fileSeparator - + sampleText); - StringBuffer sb = new StringBuffer(); - int len = 0; - InputStreamReader isr = new InputStreamReader(fis, "UTF-8"); - char[] buf = new char[1024]; - while ((len = isr.read(buf)) > 0) { - sb.append(buf, 0, len); - 
} - isr.close(); - expectedText = sb.toString(); - // normalize space - expectedText = expectedText.replaceAll("[ \t\r\n]+", " "); - } catch (Exception e) { - e.printStackTrace(); - } - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java deleted file mode 100644 index 9884f0c..0000000 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java +++ /dev/null @@ -1,73 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.nutch.tika; - -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.ParseException; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.junit.Assert; -import org.junit.Test; - -/** - * Unit tests for PdfParser. - * - * @author John Xing - */ -public class TestPdfParser { - - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - // Make sure sample files are copied to "test.data" as specified in - // ./src/plugin/parse-pdf/build.xml during plugin compilation. - // Check ./src/plugin/parse-pdf/sample/README.txt for what they are. 
- private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" }; - - private String expectedText = "A VERY SMALL PDF FILE"; - - @Test - public void testIt() throws ProtocolException, ParseException { - String urlString; - Protocol protocol; - Content content; - Parse parse; - - for (int i = 0; i < sampleFiles.length; i++) { - urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; - - Configuration conf = NutchConfiguration.create(); - protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) - .get(content.getUrl()); - - int index = parse.getText().indexOf(expectedText); - Assert.assertTrue(index > 0); - } - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java deleted file mode 100644 index f15d821..0000000 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java +++ /dev/null @@ -1,81 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.tika; - -// Nutch imports -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.metadata.DublinCore; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.ParseException; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.util.NutchConfiguration; - -// Hadoop imports -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.junit.Assert; -import org.junit.Ignore; -import org.junit.Test; - -/** - * Unit tests for TestRTFParser. (Adapted from John Xing msword unit tests). - * - * @author Andy Hedges - */ -public class TestRTFParser { - - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - // Make sure sample files are copied to "test.data" as specified in - // ./src/plugin/parse-rtf/build.xml during plugin compilation. - // Check ./src/plugin/parse-rtf/sample/README.txt for what they are. - private String rtfFile = "test.rtf"; - - @Ignore("There seems to be an issue with line 71 e.g. 
text.trim()") - @Test - public void testIt() throws ProtocolException, ParseException { - - String urlString; - Protocol protocol; - Content content; - Parse parse; - - Configuration conf = NutchConfiguration.create(); - urlString = "file:" + sampleDir + fileSeparator + rtfFile; - protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()) - .getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get( - content.getUrl()); - String text = parse.getText(); - Assert.assertEquals("The quick brown fox jumps over the lazy dog", - text.trim()); - - String title = parse.getData().getTitle(); - Metadata meta = parse.getData().getParseMeta(); - - Assert.assertEquals("test rft document", title); - Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT)); - - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java deleted file mode 100644 index 4224f93..0000000 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java +++ /dev/null @@ -1,156 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.tika; - -import org.apache.nutch.parse.HTMLMetaTags; -import org.apache.nutch.parse.tika.HTMLMetaProcessor; - -import java.io.ByteArrayInputStream; -import java.net.URL; - -import org.xml.sax.*; -import org.w3c.dom.*; -import org.apache.html.dom.*; -import org.cyberneko.html.parsers.DOMFragmentParser; -import org.junit.Assert; -import org.junit.Test; - -/** Unit tests for HTMLMetaProcessor. */ -public class TestRobotsMetaProcessor { - - /* - * - * some sample tags: - * - * <meta name="robots" content="index,follow"> <meta name="robots" - * content="noindex,follow"> <meta name="robots" content="index,nofollow"> - * <meta name="robots" content="noindex,nofollow"> - * - * <META HTTP-EQUIV="Pragma" CONTENT="no-cache"> - */ - - public static String[] tests = { - "<html><head><title>test page</title>" - + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> " - + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"all\"> " - + "<meta http-equiv=\"pragma\" content=\"no-cache\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> " - + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"none\"> " + "</head><body>" - + " some text" + "</body></html>", - - "<html><head><title>test 
page</title>" - + "<meta name=\"robots\" content=\"noindex,nofollow\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"noindex,follow\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"index,nofollow\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"index,follow\"> " - + "<base href=\"http://www.nutch.org/\">" + "</head><body>" - + " some text" + "</body></html>", - - "<html><head><title>test page</title>" + "<meta name=\"robots\"> " - + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>" - + " some text" + "</body></html>", - - }; - - public static final boolean[][] answers = { { true, true, true }, // NONE - { false, false, true }, // all - { true, true, true }, // nOnE - { true, true, false }, // none - { true, true, false }, // noindex,nofollow - { true, false, false }, // noindex,follow - { false, true, false }, // index,nofollow - { false, false, false }, // index,follow - { false, false, false }, // missing! 
- }; - - private URL[][] currURLsAndAnswers; - - @Test - public void testRobotsMetaProcessor() { - DOMFragmentParser parser = new DOMFragmentParser(); - ; - - try { - currURLsAndAnswers = new URL[][] { - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org/foo/"), - new URL("http://www.nutch.org/") }, - { new URL("http://www.nutch.org"), - new URL("http://www.nutch.org/base/") } }; - } catch (Exception e) { - Assert.assertTrue("couldn't make test URLs!", false); - } - - for (int i = 0; i < tests.length; i++) { - byte[] bytes = tests[i].getBytes(); - - DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); - - try { - parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node); - } catch (Exception e) { - e.printStackTrace(); - } - - HTMLMetaTags robotsMeta = new HTMLMetaTags(); - HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]); - - Assert.assertTrue("got index wrong on test " + i, - robotsMeta.getNoIndex() == answers[i][0]); - Assert.assertTrue("got follow wrong on test " + i, - robotsMeta.getNoFollow() == answers[i][1]); - Assert.assertTrue("got cache wrong on test " + i, - robotsMeta.getNoCache() == answers[i][2]); - Assert - .assertTrue( - "got base href wrong on test " + i + " (got " - + robotsMeta.getBaseHref() + ")", - ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null)) - || ((robotsMeta.getBaseHref() != null) && robotsMeta - .getBaseHref().equals(currURLsAndAnswers[i][1]))); - - } - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-zip/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-zip/build.xml 
b/src/plugin/parse-zip/build.xml deleted file mode 100644 index 991ce31..0000000 --- a/src/plugin/parse-zip/build.xml +++ /dev/null @@ -1,38 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="parse-zip" default="jar-core"> - - <import file="../build-plugin.xml"/> - - <!-- Deploy Unit test dependencies --> - <target name="deps-test"> - <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> - <ant target="deploy" inheritall="false" dir="../protocol-file"/> - <!-- <ant target="deploy" inheritall="false" dir="../parse-text"/>--> - </target> - - - <!-- for junit test --> - <mkdir dir="${build.test}/data" /> - <copy todir="${build.test}/data"> - <fileset dir="sample"> - <include name="*.zip" /> - </fileset> - </copy> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-zip/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-zip/ivy.xml b/src/plugin/parse-zip/ivy.xml deleted file mode 100644 index 1a86d68..0000000 --- a/src/plugin/parse-zip/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor 
license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-zip/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-zip/plugin.xml b/src/plugin/parse-zip/plugin.xml deleted file mode 100644 index 35ec0eb..0000000 --- a/src/plugin/parse-zip/plugin.xml +++ /dev/null @@ -1,46 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="parse-zip" - name="Zip Parse Plug-in" - version="1.0.0" - provider-name="nutch.org"> - - <runtime> - <library name="parse-zip.jar"> - <export name="*"/> - </library> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - </requires> - - <extension id="org.apache.nutch.parse.zip" - name="ZipParser" - point="org.apache.nutch.parse.Parser"> - - <implementation id="org.apache.nutch.parse.zip.ZipParser" - class="org.apache.nutch.parse.zip.ZipParser"> - <parameter name="contentType" value="application/zip"/> - <parameter name="pathSuffix" value="zip"/> - </implementation> - - </extension> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-zip/sample/test.zip ---------------------------------------------------------------------- diff --git a/src/plugin/parse-zip/sample/test.zip b/src/plugin/parse-zip/sample/test.zip deleted file mode 100644 index 0c649d2..0000000 Binary files a/src/plugin/parse-zip/sample/test.zip and /dev/null differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java deleted file mode 100644 index f441fd0..0000000 --- a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java +++ /dev/null @@ -1,144 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) 
under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parse.zip; - -import java.io.ByteArrayInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.ParseResult; -import org.apache.nutch.parse.ParseStatus; -import org.apache.nutch.parse.Parser; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.hadoop.conf.Configuration; - -/** - * ZipParser class based on MSPowerPointParser class by Stephan Strittmatter. 
- * Nutch parse plugin for zip files - Content Type : application/zip - */ -public class ZipParser implements Parser { - - private static final Logger LOG = LoggerFactory.getLogger(ZipParser.class); - private Configuration conf; - - /** Creates a new instance of ZipParser */ - public ZipParser() { - } - - public ParseResult getParse(final Content content) { - - String resultText = null; - String resultTitle = null; - Outlink[] outlinks = null; - List<Outlink> outLinksList = new ArrayList<Outlink>(); - - try { - final String contentLen = content.getMetadata().get( - Response.CONTENT_LENGTH); - final int len = Integer.parseInt(contentLen); - if (LOG.isDebugEnabled()) { - LOG.debug("ziplen: " + len); - } - final byte[] contentInBytes = content.getContent(); - - if (contentLen != null && contentInBytes.length != len) { - return new ParseStatus(ParseStatus.FAILED, - ParseStatus.FAILED_TRUNCATED, "Content truncated at " - + contentInBytes.length - + " bytes. Parser can't handle incomplete zip file.") - .getEmptyParseResult(content.getUrl(), getConf()); - } - - ZipTextExtractor extractor = new ZipTextExtractor(getConf()); - - // extract text - resultText = extractor.extractText(new ByteArrayInputStream( - contentInBytes), content.getUrl(), outLinksList); - - } catch (Exception e) { - return new ParseStatus(ParseStatus.FAILED, - "Can't be handled as Zip document. 
" + e).getEmptyParseResult( - content.getUrl(), getConf()); - } - - if (resultText == null) { - resultText = ""; - } - - if (resultTitle == null) { - resultTitle = ""; - } - - outlinks = (Outlink[]) outLinksList.toArray(new Outlink[0]); - final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, - resultTitle, outlinks, content.getMetadata()); - - if (LOG.isTraceEnabled()) { - LOG.trace("Zip file parsed sucessfully !!"); - } - return ParseResult.createParseResult(content.getUrl(), new ParseImpl( - resultText, parseData)); - } - - public void setConf(Configuration conf) { - this.conf = conf; - } - - public Configuration getConf() { - return this.conf; - } - - public static void main(String[] args) throws IOException { - if (args.length < 1) { - System.out.println("ZipParser <zip_file>"); - System.exit(1); - } - File file = new File(args[0]); - String url = "file:"+file.getCanonicalPath(); - FileInputStream in = new FileInputStream(file); - byte[] bytes = new byte[in.available()]; - in.read(bytes); - in.close(); - Configuration conf = NutchConfiguration.create(); - ZipParser parser = new ZipParser(); - parser.setConf(conf); - Metadata meta = new Metadata(); - meta.add(Response.CONTENT_LENGTH, ""+file.length()); - ParseResult parseResult = parser.getParse(new Content(url, url, bytes, - "application/zip", meta, conf)); - Parse p = parseResult.get(url); - System.out.println(parseResult.size()); - System.out.println("Parse Text:"); - System.out.println(p.getText()); - System.out.println("Parse Data:"); - System.out.println(p.getData()); - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java deleted file mode 100644 index 
b454727..0000000 --- a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java +++ /dev/null @@ -1,120 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parse.zip; - -// JDK imports -import java.io.IOException; -import java.io.InputStream; -import java.util.List; -import java.util.zip.ZipEntry; -import java.util.zip.ZipInputStream; -import java.net.URL; - -// Commons Logging imports -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -// Hadoop imports -import org.apache.hadoop.conf.Configuration; - -// Nutch imports -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.ParseException; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.protocol.Content; -import org.apache.tika.Tika; - -/** - * - * @author Rohit Kulkarni & Ashish Vaidya - */ -public class ZipTextExtractor { - - public static final Logger LOG = LoggerFactory - .getLogger(ZipTextExtractor.class); - - private Configuration conf; - - /** Creates a new instance of ZipTextExtractor */ - 
public ZipTextExtractor(Configuration conf) { - this.conf = conf; - } - - public String extractText(InputStream input, String url, - List<Outlink> outLinksList) throws IOException { - String resultText = ""; - ZipInputStream zin = new ZipInputStream(input); - ZipEntry entry; - - while ((entry = zin.getNextEntry()) != null) { - - if (!entry.isDirectory()) { - int size = (int) entry.getSize(); - byte[] b = new byte[size]; - for (int x = 0; x < size; x++) { - int err = zin.read(); - if (err != -1) { - b[x] = (byte) err; - } - } - String newurl = url + "/"; - String fname = entry.getName(); - newurl += fname; - URL aURL = new URL(newurl); - String base = aURL.toString(); - int i = fname.lastIndexOf('.'); - if (i != -1) { - // Trying to resolve the Mime-Type - Tika tika = new Tika(); - String contentType = tika.detect(fname); - try { - Metadata metadata = new Metadata(); - metadata.set(Response.CONTENT_LENGTH, - Long.toString(entry.getSize())); - metadata.set(Response.CONTENT_TYPE, contentType); - Content content = new Content(newurl, base, b, contentType, - metadata, this.conf); - Parse parse = new ParseUtil(this.conf).parse(content).get( - content.getUrl()); - ParseData theParseData = parse.getData(); - Outlink[] theOutlinks = theParseData.getOutlinks(); - - for (int count = 0; count < theOutlinks.length; count++) { - outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), - theOutlinks[count].getAnchor())); - } - - resultText += entry.getName() + " " + parse.getText() + " "; - } catch (ParseException e) { - if (LOG.isInfoEnabled()) { - LOG.info("fetch okay, but can't parse " + fname + ", reason: " - + e.getMessage()); - } - } - } - } - } - - return resultText; - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/package-info.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/package-info.java 
b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/package-info.java deleted file mode 100644 index fc81ee1..0000000 --- a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Parse ZIP files: embedded files are recursively passed to appropriate parsers. - */ -package org.apache.nutch.parse.zip; - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java b/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java deleted file mode 100644 index 17e386a..0000000 --- a/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java +++ /dev/null @@ -1,71 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parse.zip; - -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.ParseException; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.junit.Assert; -import org.junit.Test; - -/** - * Based on Unit tests for MSWordParser by John Xing - * - * @author Rohit Kulkarni & Ashish Vaidya - */ -public class TestZipParser { - - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - - // Make sure sample files are copied to "test.data" - - private String[] sampleFiles = { "test.zip" }; - - private String expectedText = "textfile.txt This is text file number 1 "; - - @Test - public void testIt() throws ProtocolException, ParseException { - String urlString; - Protocol protocol; - Content content; - Parse parse; - - Configuration conf = NutchConfiguration.create(); - for (int i = 0; i < sampleFiles.length; i++) { - urlString = "file:" + 
sampleDir + fileSeparator + sampleFiles[i]; - - protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-zip", content).get( - content.getUrl()); - Assert.assertTrue(parse.getText().equals(expectedText)); - } - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-naivebayes/build-ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parsefilter-naivebayes/build-ivy.xml b/src/plugin/parsefilter-naivebayes/build-ivy.xml deleted file mode 100644 index 22bee5f..0000000 --- a/src/plugin/parsefilter-naivebayes/build-ivy.xml +++ /dev/null @@ -1,54 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<project name="parsefilter-naivebayes" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant"> - - <property name="ivy.install.version" value="2.1.0" /> - <condition property="ivy.home" value="${env.IVY_HOME}"> - <isset property="env.IVY_HOME" /> - </condition> - <property name="ivy.home" value="${user.home}/.ant" /> - <property name="ivy.checksums" value="" /> - <property name="ivy.jar.dir" value="${ivy.home}/lib" /> - <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" /> - - <target name="download-ivy" unless="offline"> - - <mkdir dir="${ivy.jar.dir}"/> - <!-- download Ivy from web site so that it can be used even without any special installation --> - <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" - dest="${ivy.jar.file}" usetimestamp="true"/> - </target> - - <target name="init-ivy" depends="download-ivy"> - <!-- try to load ivy here from ivy home, in case the user has not already dropped - it into ant's lib dir (note that the latter copy will always take precedence). - We will not fail as long as local lib dir exists (it may be empty) and - ivy is in at least one of ant's lib dir or the local lib dir. 
--> - <path id="ivy.lib.path"> - <fileset dir="${ivy.jar.dir}" includes="*.jar"/> - - </path> - <taskdef resource="org/apache/ivy/ant/antlib.xml" - uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/> - </target> - - <target name="deps-jar" depends="init-ivy"> - <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/> - </target> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-naivebayes/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parsefilter-naivebayes/build.xml b/src/plugin/parsefilter-naivebayes/build.xml deleted file mode 100644 index 6fb7a9d..0000000 --- a/src/plugin/parsefilter-naivebayes/build.xml +++ /dev/null @@ -1,22 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<project name="parsefilter-naivebayes" default="jar-core"> - - <import file="../build-plugin.xml"/> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-naivebayes/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parsefilter-naivebayes/ivy.xml b/src/plugin/parsefilter-naivebayes/ivy.xml deleted file mode 100644 index 08cca2c..0000000 --- a/src/plugin/parsefilter-naivebayes/ivy.xml +++ /dev/null @@ -1,49 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - - <dependency org="org.apache.mahout" name="mahout-math" rev="0.10.1" /> - <dependency org="org.apache.mahout" name="mahout-core" rev="0.9" > - <exclude org="org.apache.mrunit" name="mrunit"/> - </dependency> - <dependency org="org.apache.lucene" name="lucene-core" rev="5.5.0" /> - <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="5.5.0" /> - - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-naivebayes/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parsefilter-naivebayes/plugin.xml b/src/plugin/parsefilter-naivebayes/plugin.xml deleted file mode 100644 index ac15041..0000000 --- a/src/plugin/parsefilter-naivebayes/plugin.xml +++ /dev/null @@ -1,56 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="parsefilter-naivebayes" - name="Naive Bayes Parse Filter" - version="1.0.0" - provider-name="nutch.org"> - - <runtime> - <library name="parsefilter-naivebayes.jar"> - <export name="*"/> - </library> - <library name="commons-cli-2.0-mahout.jar"/> - <library name="commons-lang3-3.1.jar"/> - <library name="commons-math3-3.2.jar"/> - <library name="guava-14.0.1.jar"/> - <library name="jackson-core-asl-1.9.12.jar"/> - <library name="jackson-mapper-asl-1.9.12.jar"/> - <library name="lucene-analyzers-common-5.5.0.jar"/> - <library name="lucene-core-5.5.0.jar"/> - <library name="mahout-core-0.9.jar"/> - <library name="mahout-math-0.10.1.jar"/> - <library name="slf4j-api-1.7.12.jar"/> - <library name="solr-commons-csv-3.5.0.jar"/> - <library name="t-digest-3.1.jar"/> - <library name="xmlpull-1.1.3.1.jar"/> - <library name="xpp3_min-1.1.4c.jar"/> - <library name="xstream-1.4.4.jar"/> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - </requires> - - <extension id="org.apache.nutch.htmlparsefilter.naivebayes" - name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter"> - <implementation id="NaiveBayesHTMLParseFilter" - class="org.apache.nutch.parsefilter.naivebayes.NaiveBayesParseFilter"/> - </extension> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java ---------------------------------------------------------------------- diff --git 
a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java deleted file mode 100644 index d755ff6..0000000 --- a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java +++ /dev/null @@ -1,120 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.nutch.parsefilter.naivebayes; - -import java.io.BufferedReader; -import java.io.FileReader; -import java.io.IOException; -import java.util.HashMap; -import java.io.InputStreamReader; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; - -public class Classify { - - private static int uniquewords_size = 0; - - private static int numof_ir = 0; - private static int numwords_ir = 0; - private static HashMap<String, Integer> wordfreq_ir = null; - - private static int numof_r = 0; - private static int numwords_r = 0; - private static HashMap<String, Integer> wordfreq_r = null; - private static boolean ismodel = false; - - public static HashMap<String, Integer> unflattenToHashmap(String line) { - HashMap<String, Integer> dict = new HashMap<String, Integer>(); - - String dictarray[] = line.split(","); - - for (String field : dictarray) { - - dict.put(field.split(":")[0], Integer.valueOf(field.split(":")[1])); - } - - return dict; - - } - - public static String classify(String line) throws IOException { - - double prob_ir = 0; - double prob_r = 0; - - String result = "1"; - - String[] linearray = line.replaceAll("[^a-zA-Z ]", "").toLowerCase() - .split(" "); - - // read the training file - // read the line - if (!ismodel) { - Configuration configuration = new Configuration(); - FileSystem fs = FileSystem.get(configuration); - - BufferedReader bufferedReader = new BufferedReader(new InputStreamReader( - fs.open(new Path("naivebayes-model")))); - - uniquewords_size = Integer.valueOf(bufferedReader.readLine()); - bufferedReader.readLine(); - - numof_ir = Integer.valueOf(bufferedReader.readLine()); - numwords_ir = Integer.valueOf(bufferedReader.readLine()); - wordfreq_ir = unflattenToHashmap(bufferedReader.readLine()); - bufferedReader.readLine(); - numof_r = Integer.valueOf(bufferedReader.readLine()); - numwords_r = Integer.valueOf(bufferedReader.readLine()); - wordfreq_r = 
unflattenToHashmap(bufferedReader.readLine()); - - ismodel = true; - - bufferedReader.close(); - - } - - // update probabilities - - for (String word : linearray) { - if (wordfreq_ir.containsKey(word)) - prob_ir += Math.log(wordfreq_ir.get(word)) + 1 - - Math.log(numwords_ir + uniquewords_size); - else - prob_ir += 1 - Math.log(numwords_ir + uniquewords_size); - - if (wordfreq_r.containsKey(word)) - prob_r += Math.log(wordfreq_r.get(word)) + 1 - - Math.log(numwords_r + uniquewords_size); - else - prob_r += 1 - Math.log(numwords_r + uniquewords_size); - - } - - prob_ir += Math.log(numof_ir) - Math.log(numof_ir + numof_r); - prob_r += Math.log(numof_r) - Math.log(numof_ir + numof_r); - - if (prob_ir > prob_r) - result = "0"; - else - result = "1"; - - return result; - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java deleted file mode 100644 index 30810ae..0000000 --- a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java +++ /dev/null @@ -1,197 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.parsefilter.naivebayes; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.w3c.dom.DocumentFragment; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.StringUtils; -import org.apache.nutch.parse.HTMLMetaTags; -import org.apache.nutch.parse.HtmlParseFilter; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseResult; -import org.apache.nutch.protocol.Content; - -import java.io.Reader; -import java.io.BufferedReader; -import java.io.IOException; -import java.util.ArrayList; - -/** - * Html Parse filter that classifies the outlinks from the parseresult as - * relevant or irrelevant based on the parseText's relevancy (using a training - * file where you can give positive and negative example texts see the - * description of parsefilter.naivebayes.trainfile) and if found irrelevant it - * gives the link a second chance if it contains any of the words from the list - * given in parsefilter.naivebayes.wordlist. CAUTION: Set the parser.timeout to - * -1 or a bigger value than 30, when using this classifier. 
- */ -public class NaiveBayesParseFilter implements HtmlParseFilter { - - private static final Logger LOG = LoggerFactory - .getLogger(NaiveBayesParseFilter.class); - - public static final String TRAINFILE_MODELFILTER = "parsefilter.naivebayes.trainfile"; - public static final String DICTFILE_MODELFILTER = "parsefilter.naivebayes.wordlist"; - - private Configuration conf; - private String inputFilePath; - private String dictionaryFile; - private ArrayList<String> wordlist = new ArrayList<String>(); - - public boolean filterParse(String text) { - - try { - return classify(text); - } catch (IOException e) { - LOG.error("Error occured while classifying:: " + text + " ::" - + StringUtils.stringifyException(e)); - } - - return false; - } - - public boolean filterUrl(String url) { - - return containsWord(url, wordlist); - - } - - public boolean classify(String text) throws IOException { - - // if classified as relevant "1" then return true - if (Classify.classify(text).equals("1")) - return true; - return false; - } - - public void train() throws Exception { - // check if the model file exists, if it does then don't train - if (!FileSystem.get(conf).exists(new Path("naivebayes-model"))) { - LOG.info("Training the Naive Bayes Model"); - Train.start(inputFilePath); - } else { - LOG.info("Model file already exists. 
Skipping training."); - } - } - - public boolean containsWord(String url, ArrayList<String> wordlist) { - for (String word : wordlist) { - if (url.contains(word)) { - return true; - } - } - - return false; - } - - public void setConf(Configuration conf) { - this.conf = conf; - inputFilePath = conf.get(TRAINFILE_MODELFILTER); - dictionaryFile = conf.get(DICTFILE_MODELFILTER); - if (inputFilePath == null || inputFilePath.trim().length() == 0 - || dictionaryFile == null || dictionaryFile.trim().length() == 0) { - String message = "ParseFilter: NaiveBayes: trainfile or wordlist not set in the parsefilte.naivebayes.trainfile or parsefilte.naivebayes.wordlist"; - if (LOG.isErrorEnabled()) { - LOG.error(message); - } - throw new IllegalArgumentException(message); - } - try { - if ((FileSystem.get(conf).exists(new Path(inputFilePath))) - || (FileSystem.get(conf).exists(new Path(dictionaryFile)))) { - String message = "ParseFilter: NaiveBayes: " + inputFilePath + " or " - + dictionaryFile + " not found!"; - if (LOG.isErrorEnabled()) { - LOG.error(message); - } - throw new IllegalArgumentException(message); - } - - BufferedReader br = null; - - String CurrentLine; - Reader reader = conf.getConfResourceAsReader(dictionaryFile); - br = new BufferedReader(reader); - while ((CurrentLine = br.readLine()) != null) { - wordlist.add(CurrentLine); - } - - } catch (IOException e) { - LOG.error(StringUtils.stringifyException(e)); - } - try { - train(); - } catch (Exception e) { - - LOG.error("Error occured while training:: " - + StringUtils.stringifyException(e)); - - } - - } - - public Configuration getConf() { - return this.conf; - } - - @Override - public ParseResult filter(Content content, ParseResult parseResult, - HTMLMetaTags metaTags, DocumentFragment doc) { - - Parse parse = parseResult.get(content.getUrl()); - - String url = content.getBaseUrl(); - ArrayList<Outlink> tempOutlinks = new ArrayList<Outlink>(); - String text = parse.getText(); - - if (!filterParse(text)) { // 
kick in the second tier - // if parent page found - // irrelevant - LOG.info("ParseFilter: NaiveBayes: Page found irrelevant:: " + url); - LOG.info("Checking outlinks"); - - Outlink[] out = null; - for (int i = 0; i < parse.getData().getOutlinks().length; i++) { - LOG.info("ParseFilter: NaiveBayes: Outlink to check:: " - + parse.getData().getOutlinks()[i].getToUrl()); - if (filterUrl(parse.getData().getOutlinks()[i].getToUrl())) { - tempOutlinks.add(parse.getData().getOutlinks()[i]); - LOG.info("ParseFilter: NaiveBayes: found relevant"); - - } else { - LOG.info("ParseFilter: NaiveBayes: found irrelevant"); - } - } - out = new Outlink[tempOutlinks.size()]; - for (int i = 0; i < tempOutlinks.size(); i++) { - out[i] = tempOutlinks.get(i); - } - parse.getData().setOutlinks(out); - - } else { - LOG.info("ParseFilter: NaiveBayes: Page found relevant:: " + url); - } - - return parseResult; - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java ---------------------------------------------------------------------- diff --git a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java deleted file mode 100644 index 19a6911..0000000 --- a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java +++ /dev/null @@ -1,148 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parsefilter.naivebayes; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.Writer; -import java.util.HashMap; -import java.util.HashSet; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; - -public class Train { - - public static String replacefirstoccuranceof(String tomatch, String line) { - - int index = line.indexOf(tomatch); - if (index == -1) { - return line; - } else { - return line.substring(0, index) - + line.substring(index + tomatch.length()); - } - - } - - public static void updateHashMap(HashMap<String, Integer> dict, String key) { - if (!key.equals("")) { - if (dict.containsKey(key)) - dict.put(key, dict.get(key) + 1); - else - dict.put(key, 1); - } - } - - public static String flattenHashMap(HashMap<String, Integer> dict) { - String result = ""; - - for (String key : dict.keySet()) { - - result += key + ":" + dict.get(key) + ","; - } - - // remove the last comma - result = result.substring(0, result.length() - 1); - - return result; - } - - public static void start(String filepath) throws IOException { - - // two classes 0/irrelevant and 1/relevant - - // calculate the total number of instances/examples per class, word count in - // each class and for each class a word:frequency map - - int numof_ir = 0; - int numof_r = 0; - int numwords_ir = 0; - int numwords_r = 0; - HashSet<String> uniquewords = new HashSet<String>(); - HashMap<String, 
Integer> wordfreq_ir = new HashMap<String, Integer>(); - HashMap<String, Integer> wordfreq_r = new HashMap<String, Integer>(); - - String line = ""; - String target = ""; - String[] linearray = null; - - // read the line - Configuration configuration = new Configuration(); - FileSystem fs = FileSystem.get(configuration); - - BufferedReader bufferedReader = new BufferedReader( - configuration.getConfResourceAsReader(filepath)); - - while ((line = bufferedReader.readLine()) != null) { - - target = line.split("\t")[0]; - - line = replacefirstoccuranceof(target + "\t", line); - - linearray = line.replaceAll("[^a-zA-Z ]", "").toLowerCase().split(" "); - - // update the data structures - if (target.equals("0")) { - - numof_ir += 1; - numwords_ir += linearray.length; - for (int i = 0; i < linearray.length; i++) { - uniquewords.add(linearray[i]); - updateHashMap(wordfreq_ir, linearray[i]); - } - } else { - - numof_r += 1; - numwords_r += linearray.length; - for (int i = 0; i < linearray.length; i++) { - uniquewords.add(linearray[i]); - updateHashMap(wordfreq_r, linearray[i]); - } - - } - - } - - // write the model file - - Path path = new Path("naivebayes-model"); - - Writer writer = new BufferedWriter(new OutputStreamWriter(fs.create(path, - true))); - - writer.write(String.valueOf(uniquewords.size()) + "\n"); - writer.write("0\n"); - writer.write(String.valueOf(numof_ir) + "\n"); - writer.write(String.valueOf(numwords_ir) + "\n"); - writer.write(flattenHashMap(wordfreq_ir) + "\n"); - writer.write("1\n"); - writer.write(String.valueOf(numof_r) + "\n"); - writer.write(String.valueOf(numwords_r) + "\n"); - writer.write(flattenHashMap(wordfreq_r) + "\n"); - - writer.close(); - - bufferedReader.close(); - - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/package-info.java ---------------------------------------------------------------------- diff --git 
a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/package-info.java b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/package-info.java deleted file mode 100644 index 6a892be..0000000 --- a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/package-info.java +++ /dev/null @@ -1,28 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Html Parse filter that classifies the outlinks from the parseresult as - * relevant or irrelevant based on the parseText's relevancy (using a training - * file where you can give positive and negative example texts see the - * description of parsefilter.naivebayes.trainfile) and if found irrelevent - * it gives the link a second chance if it contains any of the words from the - * list given in parsefilter.naivebayes.wordlist. CAUTION: Set the - * parser.timeout to -1 or a bigger value than 30, when using this classifier. 
- */ -package org.apache.nutch.parsefilter.naivebayes; - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-regex/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parsefilter-regex/build.xml b/src/plugin/parsefilter-regex/build.xml deleted file mode 100644 index 14d1127..0000000 --- a/src/plugin/parsefilter-regex/build.xml +++ /dev/null @@ -1,27 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<project name="parsefilter-regex" default="jar-core"> - - <import file="../build-plugin.xml"/> - - <!-- for junit test --> - <mkdir dir="${build.test}/data"/> - <copy todir="${build.test}/data"> - <fileset dir="data" /> - </copy> -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-regex/data/regex-parsefilter.txt ---------------------------------------------------------------------- diff --git a/src/plugin/parsefilter-regex/data/regex-parsefilter.txt b/src/plugin/parsefilter-regex/data/regex-parsefilter.txt deleted file mode 100644 index 9d15cd8..0000000 --- a/src/plugin/parsefilter-regex/data/regex-parsefilter.txt +++ /dev/null @@ -1,10 +0,0 @@ -# Example configuration file for parsefilter-regex -# -# Parse metadata field <name> is set to true if the HTML matches the regex. The -# source can either be html or text. If source is html, the regex is applied to -# the entire HTML tree. If source is text, the regex is applied to the -# extracted text. -# -# format: <name>\t<source>\t<regex>\n -first html h1 -second text blablabla http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-regex/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parsefilter-regex/ivy.xml b/src/plugin/parsefilter-regex/ivy.xml deleted file mode 100644 index ed4cbc3..0000000 --- a/src/plugin/parsefilter-regex/ivy.xml +++ /dev/null @@ -1,37 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-regex/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parsefilter-regex/plugin.xml b/src/plugin/parsefilter-regex/plugin.xml deleted file mode 100644 index 0725492..0000000 --- a/src/plugin/parsefilter-regex/plugin.xml +++ /dev/null @@ -1,42 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="parsefilter-regex" - name="Regex Parse Filter" - version="1.0.0" - provider-name="nutch.org"> - - <runtime> - <library name="parsefilter-regex.jar"> - <export name="*"/> - </library> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - </requires> - - <extension id="org.apache.nutch.htmlparsefilter.regex" - name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter"> - <implementation id="RegexParseFilter" - class="org.apache.nutch.parsefilter.regex.RegexParseFilter"> - <parameter name="file" value="regex-parsefilter.txt"/> - </implementation> - </extension> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java deleted file mode 100644 index 0752c91..0000000 --- a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java +++ /dev/null @@ -1,199 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parsefilter.regex; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.FileReader; -import java.io.Reader; -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.parse.HTMLMetaTags; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.HtmlParseFilter; -import org.apache.nutch.parse.ParseResult; -import org.apache.nutch.plugin.Extension; -import org.apache.nutch.plugin.PluginRepository; -import org.apache.nutch.protocol.Content; - -import org.apache.commons.lang.StringUtils; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.w3c.dom.*; - -/** - * RegexParseFilter. If a regular expression matches either HTML or - * extracted text, a configurable field is set to true. 
- */ -public class RegexParseFilter implements HtmlParseFilter { - - private static final Logger LOG = LoggerFactory.getLogger(RegexParseFilter.class); - private static String attributeFile = null; - private String regexFile = null; - - private Configuration conf; - private DocumentFragment doc; - - private static final Map<String,RegexRule> rules = new HashMap<String,RegexRule>(); - - public RegexParseFilter() {} - - public RegexParseFilter(String regexFile) { - this.regexFile = regexFile; - } - - public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { - Parse parse = parseResult.get(content.getUrl()); - String html = new String(content.getContent()); - String text = parse.getText(); - - for (Map.Entry<String, RegexRule> entry : rules.entrySet()) { - String field = entry.getKey(); - RegexRule regexRule = entry.getValue(); - - String source = null; - if (regexRule.source.equalsIgnoreCase("html")) { - source = html; - } - if (regexRule.source.equalsIgnoreCase("text")) { - source = text; - } - - if (source == null) { - LOG.error("source for regex rule: " + field + " misconfigured"); - } - - if (matches(source, regexRule.regex)) { - parse.getData().getParseMeta().set(field, "true"); - } else { - parse.getData().getParseMeta().set(field, "false"); - } - } - - return parseResult; - } - - public void setConf(Configuration conf) { - this.conf = conf; - - // get the extensions for domain urlfilter - String pluginName = "parsefilter-regex"; - Extension[] extensions = PluginRepository.get(conf).getExtensionPoint( - HtmlParseFilter.class.getName()).getExtensions(); - for (int i = 0; i < extensions.length; i++) { - Extension extension = extensions[i]; - if (extension.getDescriptor().getPluginId().equals(pluginName)) { - attributeFile = extension.getAttribute("file"); - break; - } - } - - // handle blank non empty input - if (attributeFile != null && attributeFile.trim().equals("")) { - attributeFile = null; - } - - if 
(attributeFile != null) { - if (LOG.isInfoEnabled()) { - LOG.info("Attribute \"file\" is defined for plugin " + pluginName - + " as " + attributeFile); - } - } - else { - if (LOG.isWarnEnabled()) { - LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin " - + pluginName); - } - } - - // domain file and attribute "file" take precedence if defined - String file = conf.get("parsefilter.regex.file"); - String stringRules = conf.get("parsefilter.regex.rules"); - if (regexFile != null) { - file = regexFile; - } - else if (attributeFile != null) { - file = attributeFile; - } - Reader reader = null; - if (stringRules != null) { // takes precedence over files - reader = new StringReader(stringRules); - } else { - reader = conf.getConfResourceAsReader(file); - } - try { - if (reader == null) { - reader = new FileReader(file); - } - readConfiguration(reader); - } - catch (IOException e) { - LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); - } - } - - public Configuration getConf() { - return this.conf; - } - - private boolean matches(String value, Pattern pattern) { - if (value != null) { - Matcher matcher = pattern.matcher(value); - return matcher.find(); - } - - return false; - } - - private synchronized void readConfiguration(Reader configReader) throws IOException { - if (rules.size() > 0) { - return; - } - - String line; - BufferedReader reader = new BufferedReader(configReader); - while ((line = reader.readLine()) != null) { - if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { - line = line.trim(); - String[] parts = line.split("\t"); - - String field = parts[0].trim(); - String source = parts[1].trim(); - String regex = parts[2].trim(); - - rules.put(field, new RegexRule(source, regex)); - } - } - } - - private static class RegexRule { - public RegexRule(String source, String regex) { - this.source = source; - this.regex = Pattern.compile(regex); - } - String source; - Pattern regex; - } -} \ No newline at end of file 
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/package-info.java ---------------------------------------------------------------------- diff --git a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/package-info.java b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/package-info.java deleted file mode 100644 index f8f46ee..0000000 --- a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/package-info.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * RegexParseFilter. If a regular expression matches either HTML or - * extracted text, a configurable field is set to true. 
- */ -package org.apache.nutch.parsefilter.regex; - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java b/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java deleted file mode 100644 index 9bd7149..0000000 --- a/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
package org.apache.nutch.parsefilter.regex;

import java.net.MalformedURLException;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
import junit.framework.TestCase;

/**
 * Checks that {@link RegexParseFilter} writes "true" or "false" into the
 * parse metadata for the rules "first" and "second" loaded from
 * regex-parsefilter.txt in the test data directory.
 */
public class TestRegexParseFilter extends TestCase {

  private final static String SEPARATOR = System.getProperty("file.separator");
  private final static String SAMPLES = System.getProperty("test.data", ".");

  /** Both rules are expected to match the given HTML and extracted text. */
  public void testPositiveFilter() throws Exception {
    Metadata meta = runFilter(
      "<body><html><h1>nutch</h1><p>this is the extracted text blablabla</p></body></html>",
      "nutch this is the extracted text blablabla");

    assertEquals("true", meta.get("first"));
    assertEquals("true", meta.get("second"));
  }

  /** Neither rule is expected to match the given HTML and extracted text. */
  public void testNegativeFilter() throws Exception {
    Metadata meta = runFilter(
      "<body><html><h2>nutch</h2><p>this is the extracted text no bla</p></body></html>",
      "nutch this is the extracted text bla");

    assertEquals("false", meta.get("first"));
    assertEquals("false", meta.get("second"));
  }

  /**
   * Configures a filter from the sample rule file, runs it over a document
   * made of the given raw HTML and extracted text, and returns the parse
   * metadata the filter wrote to.
   */
  private Metadata runFilter(String html, String extractedText) throws Exception {
    Configuration conf = NutchConfiguration.create();

    RegexParseFilter filter = new RegexParseFilter(SAMPLES + SEPARATOR + "regex-parsefilter.txt");
    filter.setConf(conf);

    String url = "http://nutch.apache.org/";
    Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf);
    Parse parse = new ParseImpl(extractedText, new ParseData());

    ParseResult result = ParseResult.createParseResult(url, parse);
    filter.filter(content, result, null, null);

    return parse.getData().getParseMeta();
  }
}
