http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java deleted file mode 100644 index 5089a10..0000000 --- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java +++ /dev/null @@ -1,155 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parse.html; - -import org.apache.nutch.parse.HTMLMetaTags; - -import java.io.ByteArrayInputStream; -import java.net.URL; - -import org.cyberneko.html.parsers.*; -import org.junit.Assert; -import org.junit.Test; -import org.xml.sax.*; -import org.w3c.dom.*; -import org.apache.html.dom.*; - -/** Unit tests for HTMLMetaProcessor. 
*/ -public class TestRobotsMetaProcessor { - - /* - * - * some sample tags: - * - * <meta name="robots" content="index,follow"> <meta name="robots" - * content="noindex,follow"> <meta name="robots" content="index,nofollow"> - * <meta name="robots" content="noindex,nofollow"> - * - * <META HTTP-EQUIV="Pragma" CONTENT="no-cache"> - */ - - public static String[] tests = { - "<html><head><title>test page</title>" - + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> " - + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"all\"> " - + "<meta http-equiv=\"pragma\" content=\"no-cache\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> " - + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"none\"> " + "</head><body>" - + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"noindex,nofollow\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"noindex,follow\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"index,nofollow\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"index,follow\"> " - + "<base href=\"http://www.nutch.org/\">" + "</head><body>" - + " some text" + "</body></html>", - - "<html><head><title>test page</title>" + "<meta name=\"robots\"> " - + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>" - + " some text" + "</body></html>", - - }; - - public static final 
boolean[][] answers = { { true, true, true }, // NONE - { false, false, true }, // all - { true, true, true }, // nOnE - { true, true, false }, // none - { true, true, false }, // noindex,nofollow - { true, false, false }, // noindex,follow - { false, true, false }, // index,nofollow - { false, false, false }, // index,follow - { false, false, false }, // missing! - }; - - private URL[][] currURLsAndAnswers; - - @Test - public void testRobotsMetaProcessor() { - DOMFragmentParser parser = new DOMFragmentParser(); - ; - - try { - currURLsAndAnswers = new URL[][] { - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org/foo/"), - new URL("http://www.nutch.org/") }, - { new URL("http://www.nutch.org"), - new URL("http://www.nutch.org/base/") } }; - } catch (Exception e) { - Assert.assertTrue("couldn't make test URLs!", false); - } - - for (int i = 0; i < tests.length; i++) { - byte[] bytes = tests[i].getBytes(); - - DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); - - try { - parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node); - } catch (Exception e) { - e.printStackTrace(); - } - - HTMLMetaTags robotsMeta = new HTMLMetaTags(); - HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]); - - Assert.assertTrue("got index wrong on test " + i, - robotsMeta.getNoIndex() == answers[i][0]); - Assert.assertTrue("got follow wrong on test " + i, - robotsMeta.getNoFollow() == answers[i][1]); - Assert.assertTrue("got cache wrong on test " + i, - robotsMeta.getNoCache() == answers[i][2]); - Assert - .assertTrue( - "got base href wrong on test " + i + " (got " - + robotsMeta.getBaseHref() + ")", - ((robotsMeta.getBaseHref() == null) && 
(currURLsAndAnswers[i][1] == null)) - || ((robotsMeta.getBaseHref() != null) && robotsMeta - .getBaseHref().equals(currURLsAndAnswers[i][1]))); - - } - } - -}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-js/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-js/build.xml b/src/plugin/parse-js/build.xml deleted file mode 100644 index d9c2146..0000000 --- a/src/plugin/parse-js/build.xml +++ /dev/null @@ -1,22 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="parse-js" default="jar-core"> - - <import file="../build-plugin.xml"/> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-js/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-js/ivy.xml b/src/plugin/parse-js/ivy.xml deleted file mode 100644 index 1a86d68..0000000 --- a/src/plugin/parse-js/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. 
- The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-js/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-js/plugin.xml b/src/plugin/parse-js/plugin.xml deleted file mode 100644 index 9c06c2a..0000000 --- a/src/plugin/parse-js/plugin.xml +++ /dev/null @@ -1,53 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="parse-js" - name="JavaScript Parser" - version="1.0.0" - provider-name="nutch.org"> - - <runtime> - <library name="parse-js.jar"> - <export name="*"/> - </library> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - </requires> - - <extension id="org.apache.nutch.parse.js" - name="JS Parser" - point="org.apache.nutch.parse.Parser"> - <implementation id="JSParser" - class="org.apache.nutch.parse.js.JSParseFilter"> - <parameter name="contentType" value="application/x-javascript"/> - <parameter name="pathSuffix" value="js"/> - </implementation> - </extension> - <extension id="org.apache.nutch.parse.js.JSParseFilter" - name="Parse JS Filter" - point="org.apache.nutch.parse.HtmlParseFilter"> - <implementation id="JSParseFilter" - class="org.apache.nutch.parse.js.JSParseFilter"> - <parameter name="contentType" value="application/x-javascript"/> - <parameter name="pathSuffix" value=""/> - </implementation> - </extension> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java deleted file mode 100644 index 8c95372..0000000 --- a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java +++ /dev/null @@ -1,301 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * 
contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.parse.js; - -import java.io.BufferedReader; -import java.io.FileInputStream; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.net.MalformedURLException; -import java.net.URL; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.apache.nutch.parse.HTMLMetaTags; -import org.apache.nutch.parse.HtmlParseFilter; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.ParseResult; -import org.apache.nutch.parse.ParseText; -import org.apache.nutch.parse.ParseStatus; -import org.apache.nutch.parse.Parser; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.hadoop.conf.Configuration; -import org.apache.oro.text.regex.MatchResult; -import org.apache.oro.text.regex.Pattern; -import org.apache.oro.text.regex.PatternCompiler; -import org.apache.oro.text.regex.PatternMatcher; -import org.apache.oro.text.regex.PatternMatcherInput; -import org.apache.oro.text.regex.Perl5Compiler; -import 
org.apache.oro.text.regex.Perl5Matcher; -import org.w3c.dom.DocumentFragment; -import org.w3c.dom.Element; -import org.w3c.dom.NamedNodeMap; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; - -/** - * This class is a heuristic link extractor for JavaScript files and code - * snippets. The general idea of a two-pass regex matching comes from Heritrix. - * Parts of the code come from OutlinkExtractor.java - */ -public class JSParseFilter implements HtmlParseFilter, Parser { - public static final Logger LOG = LoggerFactory.getLogger(JSParseFilter.class); - - private static final int MAX_TITLE_LEN = 80; - - private Configuration conf; - - public ParseResult filter(Content content, ParseResult parseResult, - HTMLMetaTags metaTags, DocumentFragment doc) { - - Parse parse = parseResult.get(content.getUrl()); - - String url = content.getBaseUrl(); - ArrayList<Outlink> outlinks = new ArrayList<Outlink>(); - walk(doc, parse, metaTags, url, outlinks); - if (outlinks.size() > 0) { - Outlink[] old = parse.getData().getOutlinks(); - String title = parse.getData().getTitle(); - List<Outlink> list = Arrays.asList(old); - outlinks.addAll(list); - ParseStatus status = parse.getData().getStatus(); - String text = parse.getText(); - Outlink[] newlinks = (Outlink[]) outlinks.toArray(new Outlink[outlinks - .size()]); - ParseData parseData = new ParseData(status, title, newlinks, parse - .getData().getContentMeta(), parse.getData().getParseMeta()); - - // replace original parse obj with new one - parseResult.put(content.getUrl(), new ParseText(text), parseData); - } - return parseResult; - } - - private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, - List<Outlink> outlinks) { - if (n instanceof Element) { - String name = n.getNodeName(); - if (name.equalsIgnoreCase("script")) { - /* - * String lang = null; Node lNode = - * n.getAttributes().getNamedItem("language"); if (lNode == null) lang = - * "javascript"; else lang = lNode.getNodeValue(); - */ - 
StringBuffer script = new StringBuffer(); - NodeList nn = n.getChildNodes(); - if (nn.getLength() > 0) { - for (int i = 0; i < nn.getLength(); i++) { - if (i > 0) - script.append('\n'); - script.append(nn.item(i).getNodeValue()); - } - // if (LOG.isInfoEnabled()) { - // LOG.info("script: language=" + lang + ", text: " + - // script.toString()); - // } - Outlink[] links = getJSLinks(script.toString(), "", base); - if (links != null && links.length > 0) - outlinks.addAll(Arrays.asList(links)); - // no other children of interest here, go one level up. - return; - } - } else { - // process all HTML 4.0 events, if present... - NamedNodeMap attrs = n.getAttributes(); - int len = attrs.getLength(); - for (int i = 0; i < len; i++) { - // Window: onload,onunload - // Form: onchange,onsubmit,onreset,onselect,onblur,onfocus - // Keyboard: onkeydown,onkeypress,onkeyup - // Mouse: - // onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup - Node anode = attrs.item(i); - Outlink[] links = null; - if (anode.getNodeName().startsWith("on")) { - links = getJSLinks(anode.getNodeValue(), "", base); - } else if (anode.getNodeName().equalsIgnoreCase("href")) { - String val = anode.getNodeValue(); - if (val != null && val.toLowerCase().indexOf("javascript:") != -1) { - links = getJSLinks(val, "", base); - } - } - if (links != null && links.length > 0) - outlinks.addAll(Arrays.asList(links)); - } - } - } - NodeList nl = n.getChildNodes(); - for (int i = 0; i < nl.getLength(); i++) { - walk(nl.item(i), parse, metaTags, base, outlinks); - } - } - - public ParseResult getParse(Content c) { - String type = c.getContentType(); - if (type != null && !type.trim().equals("") - && !type.toLowerCase().startsWith("application/x-javascript")) - return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT, - "Content not JavaScript: '" + type + "'").getEmptyParseResult( - c.getUrl(), getConf()); - String script = new String(c.getContent()); - Outlink[] outlinks = getJSLinks(script, "", 
c.getUrl()); - if (outlinks == null) - outlinks = new Outlink[0]; - // Title? use the first line of the script... - String title; - int idx = script.indexOf('\n'); - if (idx != -1) { - if (idx > MAX_TITLE_LEN) - idx = MAX_TITLE_LEN; - title = script.substring(0, idx); - } else { - idx = Math.min(MAX_TITLE_LEN, script.length()); - title = script.substring(0, idx); - } - ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, - c.getMetadata()); - return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd)); - } - - private static final String STRING_PATTERN = "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)"; - // A simple pattern. This allows also invalid URL characters. - private static final String URI_PATTERN = "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)"; - - // Alternative pattern, which limits valid url characters. - // private static final String URI_PATTERN = - // "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)"; - - /** - * This method extracts URLs from literals embedded in JavaScript. 
- */ - private Outlink[] getJSLinks(String plainText, String anchor, String base) { - - final List<Outlink> outlinks = new ArrayList<Outlink>(); - URL baseURL = null; - - try { - baseURL = new URL(base); - } catch (Exception e) { - if (LOG.isErrorEnabled()) { - LOG.error("getJSLinks", e); - } - } - - try { - final PatternCompiler cp = new Perl5Compiler(); - final Pattern pattern = cp.compile(STRING_PATTERN, - Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK - | Perl5Compiler.MULTILINE_MASK); - final Pattern pattern1 = cp.compile(URI_PATTERN, - Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK - | Perl5Compiler.MULTILINE_MASK); - final PatternMatcher matcher = new Perl5Matcher(); - - final PatternMatcher matcher1 = new Perl5Matcher(); - final PatternMatcherInput input = new PatternMatcherInput(plainText); - - MatchResult result; - String url; - - // loop the matches - while (matcher.contains(input, pattern)) { - result = matcher.getMatch(); - url = result.group(2); - PatternMatcherInput input1 = new PatternMatcherInput(url); - if (!matcher1.matches(input1, pattern1)) { - // if (LOG.isTraceEnabled()) { LOG.trace(" - invalid '" + url + "'"); - // } - continue; - } - if (url.startsWith("www.")) { - url = "http://" + url; - } else { - // See if candidate URL is parseable. If not, pass and move on to - // the next match. - try { - url = new URL(baseURL, url).toString(); - } catch (MalformedURLException ex) { - if (LOG.isTraceEnabled()) { - LOG.trace(" - failed URL parse '" + url + "' and baseURL '" - + baseURL + "'", ex); - } - continue; - } - } - url = url.replaceAll("&", "&"); - if (LOG.isTraceEnabled()) { - LOG.trace(" - outlink from JS: '" + url + "'"); - } - outlinks.add(new Outlink(url, anchor)); - } - } catch (Exception ex) { - // if it is a malformed URL we just throw it away and continue with - // extraction. 
- if (LOG.isErrorEnabled()) { - LOG.error("getJSLinks", ex); - } - } - - final Outlink[] retval; - - // create array of the Outlinks - if (outlinks != null && outlinks.size() > 0) { - retval = (Outlink[]) outlinks.toArray(new Outlink[0]); - } else { - retval = new Outlink[0]; - } - - return retval; - } - - public static void main(String[] args) throws Exception { - if (args.length < 2) { - System.err.println(JSParseFilter.class.getName() + " file.js baseURL"); - return; - } - InputStream in = new FileInputStream(args[0]); - BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8")); - StringBuffer sb = new StringBuffer(); - String line = null; - while ((line = br.readLine()) != null) - sb.append(line + "\n"); - br.close(); - - JSParseFilter parseFilter = new JSParseFilter(); - parseFilter.setConf(NutchConfiguration.create()); - Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]); - System.out.println("Outlinks extracted: " + links.length); - for (int i = 0; i < links.length; i++) - System.out.println(" - " + links[i]); - } - - public void setConf(Configuration conf) { - this.conf = conf; - } - - public Configuration getConf() { - return this.conf; - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java deleted file mode 100644 index 36d0d14..0000000 --- a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Parser and parse filter plugin to extract all (possible) links - * from JavaScript files and embedded JavaScript code snippets. - */ -package org.apache.nutch.parse.js; - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/README.txt ---------------------------------------------------------------------- diff --git a/src/plugin/parse-metatags/README.txt b/src/plugin/parse-metatags/README.txt deleted file mode 100644 index 0d5b009..0000000 --- a/src/plugin/parse-metatags/README.txt +++ /dev/null @@ -1,17 +0,0 @@ -Parse-metatags plugin - -The parse-metatags plugin consists of a HTMLParserFilter which takes as parameter a list of metatag names with '*' as default value. The values are separated by ';'. -In order to extract the values of the metatags description and keywords, you must specify in nutch-site.xml - -<property> - <name>metatags.names</name> - <value>description;keywords</value> -</property> - -Prefixes the names with 'metatag.' in the parse-metadata. For instance to index description and keywords, you need to activate the plugin index-metadata and set the value of the parameter 'index.parse.md' to 'metatag.description;metatag.keywords'. 
- -This code has been developed by DigitalPebble Ltd and offered to the community by ANT.com - - - - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-metatags/build.xml b/src/plugin/parse-metatags/build.xml deleted file mode 100644 index e30292d..0000000 --- a/src/plugin/parse-metatags/build.xml +++ /dev/null @@ -1,37 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<project name="parse-metatags" default="jar-core"> - - <import file="../build-plugin.xml" /> - - <!-- Deploy Unit test dependencies --> - <target name="deps-test"> - <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" /> - <ant target="deploy" inheritall="false" dir="../protocol-file" /> - </target> - - - <!-- for junit test --> - <mkdir dir="${build.test}/data" /> - <copy todir="${build.test}/data"> - <fileset dir="sample"> - <include name="*.html" /> - </fileset> - </copy> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-metatags/ivy.xml b/src/plugin/parse-metatags/ivy.xml deleted file mode 100644 index 1a86d68..0000000 --- a/src/plugin/parse-metatags/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-metatags/plugin.xml b/src/plugin/parse-metatags/plugin.xml deleted file mode 100644 index 07933fa..0000000 --- a/src/plugin/parse-metatags/plugin.xml +++ /dev/null @@ -1,22 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<plugin - id="parse-metatags" - name="MetaTags" - version="1.0" - provider-name="digitalpebble.com"> - - <runtime> - <library name="parse-metatags.jar"> - <export name="*"/> - </library> - </runtime> - - <extension id="org.apache.nutch.parse.metatags.parser" - name="MetaTags Parser" - point="org.apache.nutch.parse.HtmlParseFilter"> - <implementation id="MetaTagsParser" - class="org.apache.nutch.parse.metatags.MetaTagsParser"/> - </extension> - -</plugin> - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/sample/testMetatags.html ---------------------------------------------------------------------- diff --git a/src/plugin/parse-metatags/sample/testMetatags.html b/src/plugin/parse-metatags/sample/testMetatags.html deleted file mode 100644 index e9e8e6b..0000000 --- a/src/plugin/parse-metatags/sample/testMetatags.html +++ /dev/null @@ -1,9 +0,0 @@ -<html> -<head> -<meta name="Keywords" content="This is a test of keywords" /> -<meta name="Description" content="This is a test of description" /> -</head> 
-<body> -text of the document -</body> - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/sample/testMultivalueMetatags.html ---------------------------------------------------------------------- diff --git a/src/plugin/parse-metatags/sample/testMultivalueMetatags.html b/src/plugin/parse-metatags/sample/testMultivalueMetatags.html deleted file mode 100644 index ca8b737..0000000 --- a/src/plugin/parse-metatags/sample/testMultivalueMetatags.html +++ /dev/null @@ -1,12 +0,0 @@ -<html> -<head> -<meta name="DC.creator" content="Doug Cutting"> -<meta name="DC.creator" content="Michael Cafarella"> -<!-- meta keywords in different casing --> -<meta name="keywords" lang="en" content="web crawler" /> -<meta name="Keywords" lang="fr" content="robot d'indexation" /> -<meta name="KEYWORDS" lang="de" content="Webcrawler" /> -</head> -<body> -A test for multi-valued metatags. -</body> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java deleted file mode 100644 index f9b9722..0000000 --- a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java +++ /dev/null @@ -1,124 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.parse.metatags; - -import java.util.Enumeration; -import java.util.HashSet; -import java.util.Locale; -import java.util.Properties; -import java.util.Set; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.HTMLMetaTags; -import org.apache.nutch.parse.HtmlParseFilter; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseResult; -import org.apache.nutch.protocol.Content; -import org.w3c.dom.DocumentFragment; - -/** - * Parse HTML meta tags (keywords, description) and store them in the parse - * metadata so that they can be indexed with the index-metadata plugin with the - * prefix 'metatag.'. Metatags are matched ignoring case. - */ -public class MetaTagsParser implements HtmlParseFilter { - - private static final Log LOG = LogFactory.getLog(MetaTagsParser.class - .getName()); - - private Configuration conf; - - /** - * Lower-cased metatag names taken from the "metatags.names" property; the - * entry "*" makes every metatag match. - */ - private Set<String> metatagset = new HashSet<String>(); - - /** - * Stores the configuration and adds every value of "metatags.names" - * (default "*"), lower-cased with Locale.ROOT, to the metatag set. - * NOTE(review): the set is never cleared, so names from an earlier call - * remain after a later call - confirm setConf is only invoked once per - * instance. - */ - public void setConf(Configuration conf) { - this.conf = conf; - // specify whether we want a specific subset of metadata - // by default take everything we can find - String[] values = conf.getStrings("metatags.names", "*"); - for (String val : values) { - metatagset.add(val.toLowerCase(Locale.ROOT)); - } - } - - /** Returns the configuration last passed to setConf. */ - public Configuration getConf() { - return this.conf; - } - - /** - * Check whether the metatag is in the list of metatags to be indexed (or if - * '*' is specified). 
If yes, add it to parse metadata. - */ - private void addIndexedMetatags(Metadata metadata, String metatag, - String value) { - String lcMetatag = metatag.toLowerCase(Locale.ROOT); - if (metatagset.contains("*") || metatagset.contains(lcMetatag)) { - if (LOG.isDebugEnabled()) { - LOG.debug("Found meta tag: " + lcMetatag + "\t" + value); - } - metadata.add("metatag." + lcMetatag, value); - } - } - - /** - * Check whether the metatag is in the list of metatags to be indexed (or if - * '*' is specified). If yes, add it with all values to parse metadata. - */ - private void addIndexedMetatags(Metadata metadata, String metatag, - String[] values) { - String lcMetatag = metatag.toLowerCase(Locale.ROOT); - if (metatagset.contains("*") || metatagset.contains(lcMetatag)) { - for (String value : values) { - if (LOG.isDebugEnabled()) { - LOG.debug("Found meta tag: " + lcMetatag + "\t" + value); - } - metadata.add("metatag." + lcMetatag, value); - } - } - } - - /** - * Adds the selected metatags to the parse metadata of the parse registered - * under content.getUrl(). Sources are visited in this order: names already - * present in that parse metadata, the general meta tags, then the http-equiv - * tags; each match is stored under "metatag." plus the lower-cased name. - * Returns the parseResult argument itself. NOTE(review): the looked-up parse - * is dereferenced without a null check - confirm a parse always exists for - * content.getUrl(). - */ - public ParseResult filter(Content content, ParseResult parseResult, - HTMLMetaTags metaTags, DocumentFragment doc) { - - Parse parse = parseResult.get(content.getUrl()); - Metadata metadata = parse.getData().getParseMeta(); - - // check in the metadata first : the tika-parser - // might have stored the values there already - for (String mdName : metadata.names()) { - addIndexedMetatags(metadata, mdName, metadata.getValues(mdName)); - } - - Metadata generalMetaTags = metaTags.getGeneralTags(); - for (String tagName : generalMetaTags.names()) { - addIndexedMetatags(metadata, tagName, generalMetaTags.getValues(tagName)); - } - - Properties httpequiv = metaTags.getHttpEquivTags(); - for (Enumeration<?> tagNames = httpequiv.propertyNames(); tagNames - .hasMoreElements();) { - String name = (String) tagNames.nextElement(); - String value = httpequiv.getProperty(name); - addIndexedMetatags(metadata, name, value); - } - - return parseResult; - } - -} 
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java deleted file mode 100644 index a55cf5c..0000000 --- a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Parse filter to extract meta tags: keywords, description, etc. - * Used in combination with index-metadata plugin - * (see {@link org.apache.nutch.indexer.metadata}). 
- */ -package org.apache.nutch.parse.metatags; - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java deleted file mode 100644 index 024aadf..0000000 --- a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java +++ /dev/null @@ -1,104 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.nutch.parse.metatags; - -import java.util.Set; -import java.util.TreeSet; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -/** - * Parses the sample HTML files and checks that their metatags are available - * in the parse metadata under keys starting with "metatag.". - */ -public class TestMetatagParser { - - private String fileSeparator = System.getProperty("file.separator"); - private String sampleDir = System.getProperty("test.data", "."); - private String sampleFile = "testMetatags.html"; - private String sampleFileMultival = "testMultivalueMetatags.html"; - private String description = "This is a test of description"; - private String keywords = "This is a test of keywords"; - - /** - * Fetches the named file from the sample directory through a "file:" URL, - * parses it with the given configuration and returns the parse metadata. - * Any exception is printed and turned into a test failure. - */ - public Metadata parseMeta(String fileName, Configuration conf) { - Metadata metadata = null; - try { - String urlString = "file:" + sampleDir + fileSeparator + fileName; - Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); - Content content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); - metadata = parse.getData().getParseMeta(); - } catch (Exception e) { - e.printStackTrace(); - Assert.fail(e.toString()); - } - return metadata; - } - - @Test - /** test defaults: keywords and description */ - public void testIt() { - Configuration conf = NutchConfiguration.create(); - - // check that we get the same values - Metadata parseMeta = parseMeta(sampleFile, conf); - - Assert.assertEquals(description, parseMeta.get("metatag.description")); - Assert.assertEquals(keywords, parseMeta.get("metatag.keywords")); - } - - @Test - /** test multiple metatags 
resulting in metadata with multiple values */ - public void testMultiValueMetatags() { - Configuration conf = NutchConfiguration.create(); - /* restrict the collected metatags to the two names checked below */ - conf.set("metatags.names", "keywords,DC.creator"); - conf.set("index.parse.md", "metatag.keywords,metatag.dc.creator"); - - Metadata parseMeta = parseMeta(sampleFileMultival, conf); - - String failMessage = "One value of metatag with multiple values is missing: "; - - /* gather every stored value, then require each expected one to be present */ - Set<String> valueSet = new TreeSet<String>(); - for (String val : parseMeta.getValues("metatag.dc.creator")) { - valueSet.add(val); - } - String[] expectedValues1 = { "Doug Cutting", "Michael Cafarella" }; - for (String val : expectedValues1) { - Assert.assertTrue(failMessage + val, valueSet.contains(val)); - } - - /* same check for the keywords values, reusing the set */ - valueSet.clear(); - for (String val : parseMeta.getValues("metatag.keywords")) { - valueSet.add(val); - } - String[] expectedValues2 = { "robot d'indexation", "web crawler", - "Webcrawler" }; - for (String val : expectedValues2) { - Assert.assertTrue(failMessage + val, valueSet.contains(val)); - } - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/README.txt ---------------------------------------------------------------------- diff --git a/src/plugin/parse-replace/README.txt b/src/plugin/parse-replace/README.txt deleted file mode 100644 index a18bd9c..0000000 --- a/src/plugin/parse-replace/README.txt +++ /dev/null @@ -1,91 +0,0 @@ -ParseReplace plugin - -Allows post-parsing regexp replace manipulation of metadata fields. - -Configuration Example - <property> - <name>parse.replace.regexp</name> - <value> - id=/file:/http:/ - url=/file:/http:/128 - </value> - </property - -Property format: parse.replace.regexp - The format of the property is a list of regexp replacements, one line per field being - modified. Field names would be one of those from https://wiki.apache.org/nutch/IndexStructure. - - The fieldname preceeds the equal sign. 
The first character after the equal sign signifies - the delimiter for the regexp, the replacement value and the flags. - -Replacement Sequence - The replacements will happen in the order listed. If a field needs multiple replacement operations - they may be listed more than once. - -RegExp Format - The regexp and the optional flags should correspond to Pattern.compile(String regexp, int flags) defined - here: http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#compile%28java.lang.String,%20int%29 - Patterns are compiled when the plugin is initialized for efficiency. - -Replacement Format - The replacement value should correspond to Java Matcher(CharSequence input).replaceAll(String replacement): - http://docs.oracle.com/javase/7/docs/api/java/util/regex/Matcher.html#replaceAll%28java.lang.String%29 - -Flags - The flags value is an integer sum of the flag values defined in - http://docs.oracle.com/javase/7/docs/api/constant-values.html (Sec: java.util.regex.Pattern) - -Escaping - Since the regexp is being read from a config file, any escaped values must be double - escaped. Eg: id=/\\s+// will cause the escaped \s+ match pattern to be used. - -Multi-valued Fields - If a field has multiple values, the replacement will be applied to each value in turn. - -Non-string Datatypes - Replacement is possible only on String field datatypes. If the field you name in the property is - not a String datatype, it will be silently ignored. - -Host and URL specific replacements. - If the replacements should apply only to specific pages, then add a sequence like - - hostmatch=/host match pattern/ - fld1=/regexp/replace/flags - fld2=/regexp/replace/flags - - or - urlmatch=/url match pattern/ - fld1=/regexp/replace/flags - fld2=/regexp/replace/flags - -When using Host and URL replacements, all replacements preceding the first hostmatch or urlmatch -will apply to all parsed pages. 
Replacements following a hostmatch or urlmatch will be applied -to pages which match the host or url field (up to the next hostmatch or urlmatch line). hostmatch -and urlmatch patterns must be unique in this property. - -Plugin order - TBD... But in most cases you will want this plugin to run last. - -Testing your match patterns - Online Regexp testers like http://www.regexplanet.com/advanced/java/index.html - can help get the basics of your pattern working. - To test in nutch: - Prepare a test HTML file with the field contents you want to test. - Place this in a directory accessible to nutch. - Use the file:/// syntax to list the test file(s) in a test/urls seed list. - See the nutch faq "index my local file system" for conf settings you will need. - (Note the urlmatch and hostmatch patterns may not conform to your test file host and url; This - test approach confirms only how your global matches behave, unless your urlmatch and hostmatch - patterns also match the file: URL pattern) - - Run.. - bin/nutch inject crawl/crawldb test - bin/nutch generate crawl/crawldb crawl/segments - bin/nutch fetch crawl/segments/[segment] - bin/nutch parse crawl/segments/[segment] - - To inspect the returned fields... - bin/nutch readseg -dump crawl/segments/[segment] testout - less testout/dump - - To retry: delete crawl/segments/[segment]/crawl_parse and repeat the parse and dump step. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-replace/build.xml b/src/plugin/parse-replace/build.xml deleted file mode 100644 index ca5ccf7..0000000 --- a/src/plugin/parse-replace/build.xml +++ /dev/null @@ -1,37 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. 
See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="parse-replace" default="jar-core"> - - <import file="../build-plugin.xml" /> - - <!-- Deploy Unit test dependencies --> - <target name="deps-test"> - <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" /> - <ant target="deploy" inheritall="false" dir="../protocol-file" /> - </target> - - - <!-- for junit test --> - <mkdir dir="${build.test}/data" /> - <copy todir="${build.test}/data"> - <fileset dir="sample"> - <include name="*.html" /> - </fileset> - </copy> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-replace/ivy.xml b/src/plugin/parse-replace/ivy.xml deleted file mode 100644 index 1a86d68..0000000 --- a/src/plugin/parse-replace/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-replace/plugin.xml b/src/plugin/parse-replace/plugin.xml deleted file mode 100644 index 6368210..0000000 --- a/src/plugin/parse-replace/plugin.xml +++ /dev/null @@ -1,22 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<plugin - id="parse-replace" - name="ReplaceParser" - version="1.0" - provider-name="PeterCiuffetti"> - - <runtime> - <library name="parse-replace.jar"> - <export name="*"/> - </library> - </runtime> - - <extension id="org.apache.nutch.parse.replace.parser" - name="Replace Parser" - point="org.apache.nutch.parse.HtmlParseFilter"> - <implementation id="ReplaceParser" - class="org.apache.nutch.parse.replace.ReplaceParser"/> - </extension> - -</plugin> - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/sample/testParseReplace.html ---------------------------------------------------------------------- diff --git 
a/src/plugin/parse-replace/sample/testParseReplace.html b/src/plugin/parse-replace/sample/testParseReplace.html deleted file mode 100644 index 825dcb9..0000000 --- a/src/plugin/parse-replace/sample/testParseReplace.html +++ /dev/null @@ -1,11 +0,0 @@ -<html> - <head> - <title>Testing the power of parser-replace plugin</title> - <meta name="description" content="With this plugin, nutch is my bitch! Bwuhuhuhaha!"> - <meta name="keywords" content="Awesome, Riveting, Two Thumbs Up!"> - <meta name="author" content="Peter Ciuffetti"> - </head> - <body> - <p>This html file is used to test the Nutch parse-replace regexp replacer plugin. A decidely boring thing to do.</p> - </body> -</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/ReplaceParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/ReplaceParser.java b/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/ReplaceParser.java deleted file mode 100644 index 9773c4a..0000000 --- a/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/ReplaceParser.java +++ /dev/null @@ -1,74 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.parse.replace; - -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.parse.HTMLMetaTags; -import org.apache.nutch.parse.HtmlParseFilter; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseResult; -import org.apache.nutch.protocol.Content; -import org.w3c.dom.DocumentFragment; - -/** - * Do pattern replacements on selected field contents - * prior to indexing. - * - * NOTE: this class is currently a skeleton - parseConf has an empty body and - * filter returns the parse result it was given without modifying it. - */ -public class ReplaceParser implements HtmlParseFilter { - - private static final Log LOG = LogFactory.getLog(ReplaceParser.class - .getName()); - - /** - * Replacement rules intended to be keyed by host pattern (judging by the - * name); nothing in this class reads or writes the map yet. - */ - private static final Map<String, List<Object>> REPLACEPATTERNS_BY_HOST = new HashMap<String, List<Object>>(); - /** - * Replacement rules intended to be keyed by URL pattern (judging by the - * name); nothing in this class reads or writes the map yet. - */ - private static final Map<String, List<Object>> REPLACEPATTERNS_BY_URL = new HashMap<String, List<Object>>(); - - private Configuration conf; - - /** Not used anywhere in this class. */ - private Set<String> metatagset = new HashSet<String>(); - - /** - * Stores the configuration and, when the "parse.replace.regexp" property - * has values, passes them to parseConf. - */ - public void setConf(Configuration conf) { - this.conf = conf; - String[] values = conf.getStrings("parse.replace.regexp", null); - if (values != null) { - this.parseConf(values); - } - } - - /** Returns the configuration last passed to setConf. */ - public Configuration getConf() { - return this.conf; - } - - /** Placeholder for parsing the replacement rules; currently does nothing. */ - private void parseConf(String[] values) { - - } - - /** - * Looks up the parse stored under content.getUrl() and returns parseResult - * unchanged; the looked-up parse is not used yet. - */ - public ParseResult filter(Content content, ParseResult parseResult, - HTMLMetaTags metaTags, DocumentFragment doc) { - - Parse parse = parseResult.get(content.getUrl()); - - return parseResult; - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/package-info.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/package-info.java 
b/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/package-info.java deleted file mode 100644 index b678f00..0000000 --- a/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Parse filter to allow pattern replacements on parsed metadata. - */ -package org.apache.nutch.parse.replace; - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java b/src/plugin/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java deleted file mode 100644 index 593d5ed..0000000 --- a/src/plugin/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java +++ /dev/null @@ -1,68 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parse.replace; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -/** - * Parses the sample file testParseReplace.html and compares two entries of - * the parse metadata with fixed expected strings. - * NOTE(review): the expected description and keywords constants and the - * "metatag." keys look copied from the metatags plugin test - confirm they - * match what the sample file and the active plugins actually produce. - */ -public class TestParseReplace { - - private String fileSeparator = System.getProperty("file.separator"); - private String sampleDir = System.getProperty("test.data", "."); - private String sampleFile = "testParseReplace.html"; - private String description = "This is a test of description"; - private String keywords = "This is a test of keywords"; - - /** - * Fetches the named file from the sample directory through a "file:" URL, - * parses it with the given configuration and returns the parse metadata. - * Any exception is printed and turned into a test failure. - */ - public Metadata parseMeta(String fileName, Configuration conf) { - Metadata metadata = null; - try { - String urlString = "file:" + sampleDir + fileSeparator + fileName; - Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); - Content content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); - metadata = parse.getData().getParseMeta(); - } catch (Exception e) { - e.printStackTrace(); - 
Assert.fail(e.toString()); - } - return metadata; - } - - @Test - /** test defaults: keywords and description */ - public void testIt() { - Configuration conf = NutchConfiguration.create(); - - // check that we get the same values - Metadata parseMeta = parseMeta(sampleFile, conf); - - Assert.assertEquals(description, parseMeta.get("metatag.description")); - Assert.assertEquals(keywords, parseMeta.get("metatag.keywords")); - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-swf/build.xml b/src/plugin/parse-swf/build.xml deleted file mode 100644 index f4fb20f..0000000 --- a/src/plugin/parse-swf/build.xml +++ /dev/null @@ -1,38 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<project name="parse-swf" default="jar-core"> - - <import file="../build-plugin.xml"/> - - <!-- Deploy Unit test dependencies --> - <target name="deps-test"> - <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> - <ant target="deploy" inheritall="false" dir="../protocol-file"/> - </target> - - - <!-- for junit test --> - <mkdir dir="${build.test}/data"/> - <copy file="sample/test1.swf" todir="${build.test}/data"/> - <copy file="sample/test2.swf" todir="${build.test}/data"/> - <copy file="sample/test3.swf" todir="${build.test}/data"/> - <copy file="sample/test1.txt" todir="${build.test}/data"/> - <copy file="sample/test2.txt" todir="${build.test}/data"/> - <copy file="sample/test3.txt" todir="${build.test}/data"/> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-swf/ivy.xml b/src/plugin/parse-swf/ivy.xml deleted file mode 100644 index 1a86d68..0000000 --- a/src/plugin/parse-swf/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/lib/javaswf-LICENSE.txt ---------------------------------------------------------------------- diff --git a/src/plugin/parse-swf/lib/javaswf-LICENSE.txt b/src/plugin/parse-swf/lib/javaswf-LICENSE.txt deleted file mode 100644 index 4138a66..0000000 --- a/src/plugin/parse-swf/lib/javaswf-LICENSE.txt +++ /dev/null @@ -1,33 +0,0 @@ - - Copyright (c) 2001-2005, David N. Main, All rights reserved. - - Redistribution and use in source and binary forms, with or - without modification, are permitted provided that the - following conditions are met: - - 1. Redistributions of source code must retain the above - copyright notice, this list of conditions and the following - disclaimer. - - 2. Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials - provided with the distribution. - - 3. The name of the author may not be used to endorse or - promote products derived from this software without specific - prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY - EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE - AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, - EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/lib/javaswf.jar ---------------------------------------------------------------------- diff --git a/src/plugin/parse-swf/lib/javaswf.jar b/src/plugin/parse-swf/lib/javaswf.jar deleted file mode 100644 index 78f9b0b..0000000 Binary files a/src/plugin/parse-swf/lib/javaswf.jar and /dev/null differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-swf/plugin.xml b/src/plugin/parse-swf/plugin.xml deleted file mode 100644 index 8cc72c0..0000000 --- a/src/plugin/parse-swf/plugin.xml +++ /dev/null @@ -1,44 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="parse-swf" - name="SWF Parse Plug-in" - version="1.0.0" - provider-name="nutch.org"> - - - <runtime> - <library name="parse-swf.jar"> - <export name="*"/> - </library> - <library name="javaswf.jar"/> - </runtime> - - <extension id="org.apache.nutch.parse.swf" - name="SWFParse" - point="org.apache.nutch.parse.Parser"> - - <implementation id="org.apache.nutch.parse.swf.SWFParser" - class="org.apache.nutch.parse.swf.SWFParser"> - <parameter name="contentType" value="application/x-shockwave-flash"/> - <parameter name="pathSuffix" value="swf"/> - </implementation> - - </extension> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/sample/test1.swf ---------------------------------------------------------------------- diff --git a/src/plugin/parse-swf/sample/test1.swf b/src/plugin/parse-swf/sample/test1.swf deleted file mode 100644 index cd2019b..0000000 Binary files a/src/plugin/parse-swf/sample/test1.swf and /dev/null differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/sample/test1.txt ---------------------------------------------------------------------- diff --git a/src/plugin/parse-swf/sample/test1.txt b/src/plugin/parse-swf/sample/test1.txt deleted file mode 100644 index 68505d5..0000000 --- a/src/plugin/parse-swf/sample/test1.txt +++ /dev/null @@ -1,60 +0,0 @@ - --------- -/go/gnav_cart -/go/gnav_company -/go/gnav_devnet -/go/gnav_downloads -/go/gnav_fl_minmessage -/go/gnav_help -/go/gnav_mm_home -/go/gnav_products -/go/gnav_search?loc=en_us -/go/gnav_showcase -/go/gnav_solutions -/go/gnav_store -/go/gnav_support -/go/gnav_your_account -Acquisition Info -Adobe Home -AppleGothic -Array -Company -Developers -Downloads -Help -Home -International -LocaleManager -Macromedia Flash Player -Macromedia Home -MovieClip -Products -Showcase -Solutions -Store -String -Support 
-TextFormat -To ensure the best possible Internet Experience, please download the latest version of the free -Verdana -_sans -active -bluePill -button -color -company -devnet -downloads -en_us -home -javascript:openCrosslinkWindow('/go/adobeacquisition') -javascript:openCrosslinkWindow('/go/gnav_adobe_home') -products -rollOut -rollOver -selected -showcase -solutions -support -tabHolder -textColor http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/sample/test2.swf ---------------------------------------------------------------------- diff --git a/src/plugin/parse-swf/sample/test2.swf b/src/plugin/parse-swf/sample/test2.swf deleted file mode 100644 index eb9b03d..0000000 Binary files a/src/plugin/parse-swf/sample/test2.swf and /dev/null differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/sample/test2.txt ---------------------------------------------------------------------- diff --git a/src/plugin/parse-swf/sample/test2.txt b/src/plugin/parse-swf/sample/test2.txt deleted file mode 100644 index f77b78a..0000000 --- a/src/plugin/parse-swf/sample/test2.txt +++ /dev/null @@ -1,5 +0,0 @@ -Impact Impact Impact Arial Arial Arial Webdings Webdings Webdings Verdana Verdana Verdana CourierNew CourierNew CourierNew Bimini Bimini Bimini --------- -TextFormat -color -font http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/sample/test3.swf ---------------------------------------------------------------------- diff --git a/src/plugin/parse-swf/sample/test3.swf b/src/plugin/parse-swf/sample/test3.swf deleted file mode 100644 index 4df9f1e..0000000 Binary files a/src/plugin/parse-swf/sample/test3.swf and /dev/null differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/sample/test3.txt ---------------------------------------------------------------------- diff --git a/src/plugin/parse-swf/sample/test3.txt b/src/plugin/parse-swf/sample/test3.txt deleted file 
mode 100644 index 66ae3d8..0000000 --- a/src/plugin/parse-swf/sample/test3.txt +++ /dev/null @@ -1,11 +0,0 @@ -Mix. - Edit. - Master. - Compose. - Animate. - With a single suite of powerful tools - that work together as one. - World-class video and audio tools that bring - new power and efficiency to your film, video, - DVD, and web workflows. - Learn more.
