http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCIndexingFilter.java b/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCIndexingFilter.java new file mode 100644 index 0000000..e7c55c4 --- /dev/null +++ b/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCIndexingFilter.java @@ -0,0 +1,124 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.creativecommons.nutch;

import org.apache.nutch.metadata.CreativeCommons;

import org.apache.nutch.parse.Parse;

import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.hadoop.io.Text;

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.metadata.Metadata;

import org.apache.hadoop.conf.Configuration;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;
import java.net.URL;
import java.net.MalformedURLException;

/**
 * Indexing filter that copies the Creative Commons values found in a page's
 * parse metadata into the {@link #FIELD} field of the indexed document.
 */
public class CCIndexingFilter implements IndexingFilter {
  public static final Logger LOG = LoggerFactory
      .getLogger(CCIndexingFilter.class);

  /** Name of the document field that every feature is added to. */
  public static String FIELD = "cc";

  private Configuration conf;

  /**
   * Adds the license URL, the license location and the work type stored in
   * the parse metadata to {@code doc}. Metadata entries that are absent are
   * skipped.
   *
   * @param doc
   *          document being built; modified in place
   * @param parse
   *          parse whose metadata is read
   * @param url
   *          page URL, used for logging only
   * @param datum
   *          not used by this filter
   * @param inlinks
   *          not used by this filter
   * @return the same {@code doc} instance
   */
  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
      CrawlDatum datum, Inlinks inlinks) throws IndexingException {

    final Metadata parseMeta = parse.getData().getParseMeta();

    // License URL: indexed whole as "license=<url>", then once more split
    // into its path components.
    final String license = parseMeta.get(CreativeCommons.LICENSE_URL);
    if (license != null) {
      if (LOG.isInfoEnabled()) {
        LOG.info("CC: indexing " + license + " for: " + url.toString());
      }
      addFeature(doc, "license=" + license);
      addUrlFeatures(doc, license);
    }

    // Place the license was found in: indexed as "meta=<location>".
    final String location = parseMeta.get(CreativeCommons.LICENSE_LOCATION);
    if (location != null) {
      addFeature(doc, "meta=" + location);
    }

    // Work type: indexed as the bare value.
    // NOTE(review): unlike the two features above no "type=" prefix is
    // added here - confirm whether that is intended.
    final String type = parseMeta.get(CreativeCommons.WORK_TYPE);
    if (type != null) {
      addFeature(doc, type);
    }

    return doc;
  }

  /**
   * Adds one feature per component of a license URL's path. The path is
   * split at slashes and dashes, the first component ("licenses" in URLs of
   * the form "http://creativecommons.org/licenses/xx-xx/xx/xx") is dropped
   * and every remaining component is added as a feature.
   *
   * @param doc
   *          document the features are added to
   * @param urlString
   *          license URL; if it cannot be parsed a warning is logged and
   *          nothing is added
   */
  public void addUrlFeatures(NutchDocument doc, String urlString) {
    final URL licenseUrl;
    try {
      licenseUrl = new URL(urlString);
    } catch (MalformedURLException e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("CC: failed to parse url: " + urlString + " : " + e);
      }
      return;
    }

    final StringTokenizer components = new StringTokenizer(
        licenseUrl.getPath(), "/-");
    if (!components.hasMoreTokens()) {
      return;
    }
    components.nextToken(); // leading component, i.e. "licenses"

    while (components.hasMoreTokens()) {
      addFeature(doc, components.nextToken());
    }
  }

  /** Adds a single value to the {@link #FIELD} field of {@code doc}. */
  private void addFeature(NutchDocument doc, String feature) {
    doc.add(FIELD, feature);
  }

  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  public Configuration getConf() {
    return this.conf;
  }

}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCParseFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCParseFilter.java b/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCParseFilter.java new file mode 100644 index 0000000..1fa951e --- /dev/null +++ b/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCParseFilter.java @@ -0,0 +1,300 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.creativecommons.nutch;

import org.apache.nutch.metadata.CreativeCommons;
import org.apache.nutch.parse.*;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.metadata.Metadata;
import org.apache.hadoop.conf.Configuration;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;
import java.io.*;
import java.net.*;
import javax.xml.parsers.*;
import org.xml.sax.InputSource;
import org.w3c.dom.*;

/** Adds metadata identifying the Creative Commons license used, if any. */
public class CCParseFilter implements HtmlParseFilter {
  public static final Logger LOG = LoggerFactory.getLogger(CCParseFilter.class);

  /** Walks DOM tree, looking for RDF in comments and licenses in anchors. */
  public static class Walker {

    /** Creative Commons' namespace URI. */
    private static final String CC_NS = "http://web.resource.org/cc/";

    /** Dublin Core namespace URI. */
    private static final String DC_NS = "http://purl.org/dc/elements/1.1/";

    /** RDF syntax namespace URI. */
    private static final String RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";

    /** Path prefix every accepted license URL must start with. */
    private static final String LICENSES_PATH = "/licenses/";

    /**
     * Namespace aware XML parser factory. The XML it parses comes from
     * comments of crawled pages, i.e. it is untrusted, so DOCTYPE
     * declarations (and with them external entities) are rejected.
     */
    private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory
        .newInstance();
    static {
      FACTORY.setNamespaceAware(true);
      // second line of defence in case the feature below is unsupported
      FACTORY.setExpandEntityReferences(false);
      try {
        FACTORY.setFeature(
            "http://apache.org/xml/features/disallow-doctype-decl", true);
      } catch (ParserConfigurationException e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("CC: XML parser cannot disallow DOCTYPE declarations: " + e);
        }
      }
    }

    private URL base; // base url of page
    private String rdfLicense; // subject url found, if any
    private URL relLicense; // license url found, if any
    private URL anchorLicense; // anchor url found, if any
    private String workType; // work type URI

    private Walker(URL base) {
      this.base = base;
    }

    /**
     * Scans the document and adds the license URL, the place it was found
     * in ("rdf", "rel" or "a") and the work type to {@code metadata}.
     *
     * @param doc
     *          root of the DOM tree to scan
     * @param base
     *          base URL used to resolve relative anchor hrefs
     * @param metadata
     *          parse metadata the results are added to
     * @param conf
     *          configuration; {@code creativecommons.exclude.unlicensed}
     *          (default false) decides whether unlicensed pages are rejected
     * @throws ParseException
     *           if no license was found and unlicensed pages are excluded
     */
    public static void walk(Node doc, URL base, Metadata metadata,
        Configuration conf) throws ParseException {

      // walk the DOM tree, scanning for license data
      Walker walker = new Walker(base);
      walker.walk(doc);

      // interpret results of walk
      String licenseUrl = null;
      String licenseLocation = null;
      if (walker.rdfLicense != null) { // 1st choice: subject in RDF
        licenseLocation = "rdf";
        licenseUrl = walker.rdfLicense;
      } else if (walker.relLicense != null) { // 2nd: anchor w/ rel=license
        licenseLocation = "rel";
        licenseUrl = walker.relLicense.toString();
      } else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC license
        licenseLocation = "a";
        licenseUrl = walker.anchorLicense.toString();
      } else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) {
        throw new ParseException("No CC license. Excluding.");
      }

      // add license to metadata
      if (licenseUrl != null) {
        if (LOG.isInfoEnabled()) {
          LOG.info("CC: found " + licenseUrl + " in " + licenseLocation
              + " of " + base);
        }
        metadata.add(CreativeCommons.LICENSE_URL, licenseUrl);
        metadata.add(CreativeCommons.LICENSE_LOCATION, licenseLocation);
      }

      if (walker.workType != null) {
        if (LOG.isInfoEnabled()) {
          LOG.info("CC: found " + walker.workType + " in " + base);
        }
        metadata.add(CreativeCommons.WORK_TYPE, walker.workType);
      }

    }

    /** Scan the document looking for RDF in comments and license elements. */
    private void walk(Node node) {

      // check element nodes for license URL
      if (node instanceof Element) {
        findLicenseUrl((Element) node);
      }

      // check comment nodes for license RDF
      if (node instanceof Comment) {
        findRdf(((Comment) node).getData());
      }

      // recursively walk child nodes
      NodeList children = node.getChildNodes();
      for (int i = 0; children != null && i < children.getLength(); i++) {
        walk(children.item(i));
      }
    }

    /**
     * Extract license url from element, if any. These are the href
     * attributes of anchor elements that point below
     * http://creativecommons.org/licenses/ (or the https equivalent). The
     * first such anchor with rel="license" is remembered as the "rel"
     * license, the first other one as the "anchor" license.
     */
    private void findLicenseUrl(Element element) {
      // only look in Anchor elements
      if (!"a".equalsIgnoreCase(element.getTagName()))
        return;

      // require an href; DOM returns "" (not null) for a missing attribute,
      // and an empty href would simply resolve to the page's own base URL
      String href = element.getAttribute("href");
      if (href == null || href.isEmpty())
        return;

      try {
        URL url = new URL(base, href); // resolve the url

        if (isLicenseUrl(url)) {
          // check rel="license"
          String rel = element.getAttribute("rel");
          if ("license".equals(rel) && this.relLicense == null) {
            this.relLicense = url; // found rel license
          } else if (this.anchorLicense == null) {
            this.anchorLicense = url; // found anchor license
          }
        }
      } catch (MalformedURLException ignored) {
        // deliberately ignored: a malformed href cannot be a license link
      }
    }

    /** Tells whether url is an http(s) URL below creativecommons.org/licenses/. */
    private static boolean isLicenseUrl(URL url) {
      String protocol = url.getProtocol();
      boolean webProtocol = "http".equalsIgnoreCase(protocol)
          || "https".equalsIgnoreCase(protocol);
      String path = url.getPath();
      return webProtocol
          && "creativecommons.org".equalsIgnoreCase(url.getHost())
          && path != null && path.startsWith(LICENSES_PATH)
          && path.length() > LICENSES_PATH.length();
    }

    /**
     * Looks for Creative Commons RDF inside the text of an HTML comment and,
     * when found, records the license URL (rdf:about of cc:License) and the
     * work type (dc:type of cc:Work). Comments that are not well-formed XML
     * or lack an rdf:RDF root are logged and skipped.
     */
    private void findRdf(String comment) {
      // cheap pre-check before paying for an XML parse
      if (comment.indexOf("RDF") < 0)
        return; // no RDF, abort
      if (comment.indexOf(CC_NS) < 0)
        return; // no CC namespace, abort

      // try to parse the XML
      Document doc;
      try {
        DocumentBuilder parser = FACTORY.newDocumentBuilder();
        doc = parser.parse(new InputSource(new StringReader(comment)));
      } catch (Exception e) {
        // best effort: anything the parser throws just means "no usable RDF"
        if (LOG.isWarnEnabled()) {
          LOG.warn("CC: Failed to parse RDF in " + base + ": " + e);
        }
        return;
      }

      // check that root is rdf:RDF
      NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
      if (roots.getLength() != 1) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("CC: No RDF root in " + base);
        }
        return;
      }
      Element rdf = (Element) roots.item(0);

      // get cc:License nodes inside rdf:RDF
      NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
      for (int i = 0; i < licenses.getLength(); i++) {
        Element license = (Element) licenses.item(i);

        // license is rdf:about= attribute from cc:License; skip elements
        // that lack it instead of failing the whole page
        Attr about = license.getAttributeNodeNS(RDF_NS, "about");
        if (about != null) {
          this.rdfLicense = about.getValue();
        }
      }

      // get cc:Work nodes from rdf:RDF
      NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work");
      for (int i = 0; i < works.getLength(); i++) {
        Element work = (Element) works.item(i);

        // get dc:type nodes from this cc:Work
        NodeList types = work.getElementsByTagNameNS(DC_NS, "type");
        for (int j = 0; j < types.getLength(); j++) {
          Element type = (Element) types.item(j);
          Attr resource = type.getAttributeNodeNS(RDF_NS, "resource");
          if (resource == null) {
            continue; // dc:type without rdf:resource carries no type URI
          }
          // unknown type URIs map to null, as before
          this.workType = WORK_TYPE_NAMES.get(resource.getValue());
        }
      }
    }
  }

  /** Maps Dublin Core type URIs to the short work type names stored. */
  private static final HashMap<String, String> WORK_TYPE_NAMES = new HashMap<String, String>();
  static {
    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video");
    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");
    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive",
        "interactive");
    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software");
    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
  }

  private Configuration conf;

  /**
   * Adds metadata or otherwise modifies a parse of an HTML document, given the
   * DOM tree of a page. If the base URL is malformed, or the walker rejects
   * the page, the page's parse is replaced by an empty parse carrying the
   * failure status.
   */
  public ParseResult filter(Content content, ParseResult parseResult,
      HTMLMetaTags metaTags, DocumentFragment doc) {

    // get parse obj
    Parse parse = parseResult.get(content.getUrl());

    // construct base url
    URL base;
    try {
      base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
      return replaceWithEmptyParse(content, parseResult, e);
    }

    try {
      // extract license metadata
      Walker.walk(doc, base, parse.getData().getParseMeta(), getConf());
    } catch (ParseException e) {
      return replaceWithEmptyParse(content, parseResult, e);
    }

    return parseResult;
  }

  /** Replaces the page's entry in parseResult by an empty parse built from cause. */
  private ParseResult replaceWithEmptyParse(Content content,
      ParseResult parseResult, Throwable cause) {
    Parse emptyParse = new ParseStatus(cause).getEmptyParse(getConf());
    parseResult.put(content.getUrl(), new ParseText(emptyParse.getText()),
        emptyParse.getData());
    return parseResult;
  }

  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  public Configuration getConf() {
    return this.conf;
  }
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/package.html b/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/package.html new file mode 100644 index 0000000..0c91293 --- /dev/null +++ b/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/package.html @@ -0,0 +1,5 @@ +<html> +<body> +<p>Sample plugins that parse and index Creative Commons metadata.</p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
b/nutch-plugins/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java new file mode 100755 index 0000000..41be9ed --- /dev/null +++ b/nutch-plugins/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java @@ -0,0 +1,73 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.creativecommons.nutch;

import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Assert;
import org.junit.Test;

import java.io.*;

/**
 * Parses three sample pages and checks the Creative Commons metadata
 * ("License-Url", "License-Location", "Work-Type") found for each.
 */
public class TestCCParseFilter {

  /** Directory holding the sample pages; taken from the "test.input" system property. */
  private static final File testDir = new File(System.getProperty("test.input"));

  @Test
  public void testPages() throws Exception {
    pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
        "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
    // Tika returns <a> whereas parse-html returns <rel>
    // check later
    pageTest(new File(testDir, "rel.html"), "http://foo.com/",
        "http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
    // Tika returns <a> whereas parse-html returns <rdf>
    // check later
    pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
        "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
  }

  /**
   * Parses one file as text/html content fetched from {@code url} and
   * asserts the license metadata of the resulting parse.
   *
   * @param file
   *          sample page to parse
   * @param url
   *          URL used both as the content URL and as its base URL
   * @param license
   *          expected "License-Url" value, or null if none is expected
   * @param location
   *          expected "License-Location" value, or null
   * @param type
   *          expected "Work-Type" value, or null
   */
  public void pageTest(File file, String url, String license, String location,
      String type) throws Exception {

    String contentType = "text/html";
    byte[] bytes = readFully(file);
    Configuration conf = NutchConfiguration.create();

    Content content = new Content(url, url, bytes, contentType, new Metadata(),
        conf);
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());

    Metadata metadata = parse.getData().getParseMeta();
    Assert.assertEquals(license, metadata.get("License-Url"));
    Assert.assertEquals(location, metadata.get("License-Location"));
    Assert.assertEquals(type, metadata.get("Work-Type"));
  }

  /** Reads the whole file into memory, closing the stream even if a read fails. */
  private static byte[] readFully(File file) throws IOException {
    InputStream in = new FileInputStream(file);
    try {
      ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
      byte[] buffer = new byte[1024];
      int read;
      while ((read = in.read(buffer)) != -1) {
        out.write(buffer, 0, read);
      }
      return out.toByteArray();
    } finally {
      in.close();
    }
  }
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/feed/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/feed/build.xml b/nutch-plugins/feed/build.xml new file mode 100644 index 0000000..7fe7050 --- /dev/null +++ b/nutch-plugins/feed/build.xml @@ -0,0 +1,45 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<project name="feed" default="jar-core"> + + <import file="../build-plugin.xml" /> + + <!-- Build compilation dependencies --> + <target name="deps-jar"> + <ant target="jar" inheritall="false" dir="../lib-xml"/> + </target> + + <!-- Add compilation dependencies to classpath --> + <path id="plugin.deps"> + <fileset dir="${nutch.root}/build"> + <include name="**/lib-xml/*.jar" /> + </fileset> + </path> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" + dir="../nutch-extensionpoints" /> + <ant target="deploy" inheritall="false" dir="../protocol-file" /> + </target> + + <!-- for junit test --> + <mkdir dir="${build.test}/data" /> + <copy file="sample/rsstest.rss" todir="${build.test}/data" /> +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/feed/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/feed/ivy.xml b/nutch-plugins/feed/ivy.xml new file mode 100644 index 0000000..c29bd03 --- /dev/null +++ b/nutch-plugins/feed/ivy.xml @@ -0,0 +1,43 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + <dependency org="rome" name="rome" rev="0.9" conf="*->master"/> + <dependency org="org.jdom" name="jdom" rev="1.1" conf="*->master"/> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/feed/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/feed/plugin.xml b/nutch-plugins/feed/plugin.xml new file mode 100644 index 0000000..3a68d8d --- /dev/null +++ b/nutch-plugins/feed/plugin.xml @@ -0,0 +1,49 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin id="feed" name="Feed Parse/Index/Query Plug-in" version="1.0.0" + provider-name="nutch.org"> + <runtime> + <library name="feed.jar"> + <export name="*" /> + </library> + <library name="rome-0.9.jar" /> + <library name="jdom-1.1.jar" /> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints" /> + <import plugin="lib-xml" /> + </requires> + + <extension id="org.apache.nutch.parse.feed" name="Feed Parser" + point="org.apache.nutch.parse.Parser"> + + <implementation id="org.apache.nutch.parse.feed.FeedParser" + class="org.apache.nutch.parse.feed.FeedParser"> + <parameter name="contentType" value="application/rss+xml" /> + <parameter name="contentType" value="application/atom+xml" /> + <parameter name="contentType" value="text/xml" /> + <parameter name="pathSuffix" value="rss" /> + </implementation> + </extension> + <extension id="org.apache.nutch.indexer.feed" name="Feed Indexer" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="FeedIndexingFilter" + class="org.apache.nutch.indexer.feed.FeedIndexingFilter" /> + </extension> +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/feed/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/feed/pom.xml b/nutch-plugins/feed/pom.xml new file mode 100644 index 0000000..d94c0b6 --- /dev/null +++ b/nutch-plugins/feed/pom.xml @@ -0,0 +1,45 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>feed</artifactId> + <packaging>jar</packaging> + + <name>feed</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + + <dependencies> + <dependency> + <groupId>rome</groupId> + <artifactId>rome</artifactId> + <version>1.0</version> + </dependency> + </dependencies> +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/feed/sample/rsstest.rss ---------------------------------------------------------------------- diff --git a/nutch-plugins/feed/sample/rsstest.rss b/nutch-plugins/feed/sample/rsstest.rss new file mode 100644 index 0000000..758f6a1 --- /dev/null +++ b/nutch-plugins/feed/sample/rsstest.rss @@ -0,0 +1,36 @@ +<?xml version="1.0" encoding="ISO-8859-1" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<rss version="0.91"> + <channel> + <title>TestChannel</title> + <link>http://test.channel.com/</link> + <description>Sample RSS File for Junit test</description> + <language>en-us</language> + + <item> + <title>Home Page of Chris Mattmann</title> + <link>http://www-scf.usc.edu/~mattmann/</link> + <description>Chris Mattmann's home page</description> + </item> + <item> + <title>Awesome Open Source Search Engine</title> + <link>http://www.nutch.org/</link> + <description>Yup, that's what it is</description> + </item> + </channel> +</rss> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java b/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java new file mode 100644 index 0000000..94b440a --- /dev/null +++ b/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java @@ -0,0 +1,129 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.feed;
+
+//JDK imports
+import java.util.Date;
+
+//APACHE imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Feed;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+
+/**
+ * An {@link IndexingFilter} implementation to pull out the relevant
+ * extracted {@link Metadata} fields from the RSS feeds and into the
+ * index.
+ *
+ * @author dogacan
+ * @author mattmann
+ * @since NUTCH-444
+ */
+public class FeedIndexingFilter implements IndexingFilter {
+
+  public static final String dateFormatStr = "yyyyMMddHHmm";
+
+  private Configuration conf;
+
+  private final static String PUBLISHED_DATE = "publishedDate";
+
+  private final static String UPDATED_DATE = "updatedDate";
+
+  /**
+   * Copies the feed-related parse metadata of a document into its index
+   * fields. The following parse metadata keys are read:
+   *
+   * <ul>
+   * <li>FEED_AUTHOR (multi-valued, indexed under the same name)</li>
+   * <li>FEED_TAGS (multi-valued, indexed under the same name)</li>
+   * <li>FEED (indexed under the same name)</li>
+   * <li>FEED_PUBLISHED (indexed as a {@link Date} under "publishedDate")</li>
+   * <li>FEED_UPDATED (indexed as a {@link Date} under "updatedDate")</li>
+   * </ul>
+   *
+   * Keys that are absent from the parse metadata are skipped.
+   *
+   * @param doc
+   *          The document being built; fields are added to it in place.
+   * @param parse
+   *          The parse whose parse metadata is read.
+   * @param url
+   *          The URL of the document; only used in error messages.
+   * @param datum
+   *          Not used by this filter.
+   * @param inlinks
+   *          Not used by this filter.
+   * @return the same <code>doc</code> instance that was passed in.
+   * @throws IndexingException
+   *           If a published/updated value is not a decimal number of
+   *           milliseconds.
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    ParseData parseData = parse.getData();
+    Metadata parseMeta = parseData.getParseMeta();
+
+    String[] authors = parseMeta.getValues(Feed.FEED_AUTHOR);
+    if (authors != null) {
+      for (String author : authors) {
+        doc.add(Feed.FEED_AUTHOR, author);
+      }
+    }
+
+    String[] tags = parseMeta.getValues(Feed.FEED_TAGS);
+    if (tags != null) {
+      for (String tag : tags) {
+        doc.add(Feed.FEED_TAGS, tag);
+      }
+    }
+
+    String feed = parseMeta.get(Feed.FEED);
+    if (feed != null) {
+      doc.add(Feed.FEED, feed);
+    }
+
+    addDate(doc, PUBLISHED_DATE, parseMeta.get(Feed.FEED_PUBLISHED), url);
+    addDate(doc, UPDATED_DATE, parseMeta.get(Feed.FEED_UPDATED), url);
+
+    return doc;
+  }
+
+  /**
+   * Adds <code>millis</code> to <code>doc</code> as a {@link Date} field, or
+   * does nothing when <code>millis</code> is <code>null</code>.
+   *
+   * @throws IndexingException
+   *           If <code>millis</code> cannot be parsed as a long. Reported as
+   *           the declared checked exception instead of letting an unchecked
+   *           {@link NumberFormatException} escape from the filter.
+   */
+  private static void addDate(NutchDocument doc, String field, String millis,
+      Text url) throws IndexingException {
+    if (millis == null) {
+      return;
+    }
+    try {
+      doc.add(field, new Date(Long.parseLong(millis)));
+    } catch (NumberFormatException e) {
+      throw new IndexingException("Invalid " + field + " timestamp '" + millis
+          + "' for " + url, e);
+    }
+  }
+
+  /**
+   * @return the {@link Configuration} object used to configure this
+   *         {@link IndexingFilter}.
+   */
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Sets the {@link Configuration} object used to configure this
+   * {@link IndexingFilter}.
+   *
+   * @param conf
+   *          The {@link Configuration} object used to configure this
+   *          {@link IndexingFilter}.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/package-info.java b/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/package-info.java
new file mode 100644
index 0000000..8f52628
--- /dev/null
+++ b/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Indexing filter to index meta data from RSS feeds.
+ */ +package org.apache.nutch.indexer.feed; + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/FeedParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/FeedParser.java b/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/FeedParser.java new file mode 100644 index 0000000..936c885 --- /dev/null +++ b/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/FeedParser.java @@ -0,0 +1,374 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.parse.feed; + +// JDK imports +import java.io.ByteArrayInputStream; +import java.io.DataInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.util.Date; +import java.util.Iterator; +import java.util.List; +import java.util.Map.Entry; + +// APACHE imports +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.util.StringUtils; +// import org.apache.nutch.indexer.anchor.AnchorIndexingFilter; removed as per NUTCH-1078 +import org.apache.nutch.metadata.Feed; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.URLFilters; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.ParseText; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.parse.ParserFactory; +import org.apache.nutch.parse.ParserNotFound; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.EncodingDetector; +import org.apache.nutch.util.NutchConfiguration; +import org.xml.sax.InputSource; + +// ROME imports +import com.sun.syndication.feed.synd.SyndCategory; +import com.sun.syndication.feed.synd.SyndContent; +import com.sun.syndication.feed.synd.SyndEntry; +import com.sun.syndication.feed.synd.SyndFeed; +import com.sun.syndication.feed.synd.SyndPerson; +import com.sun.syndication.io.SyndFeedInput; + +/** + * + * @author dogacan + * @author mattmann + * @since NUTCH-444 + * + * <p> + * A new RSS/ATOM Feed{@link Parser} that rapidly parses all referenced + * links and content present in the feed. 
+ * </p>
+ *
+ */
+public class FeedParser implements Parser {
+
+  public static final String CHARSET_UTF8 = "charset=UTF-8";
+
+  public static final String TEXT_PLAIN_CONTENT_TYPE = "text/plain; "
+      + CHARSET_UTF8;
+
+  public static final Logger LOG = LoggerFactory.getLogger(FeedParser.class);
+
+  /**
+   * Charset used to encode entry text before it is handed to a sub-parser;
+   * must agree with {@link #CHARSET_UTF8}, which is what the entry's
+   * content type advertises.
+   */
+  private static final java.nio.charset.Charset UTF8 = java.nio.charset.Charset
+      .forName("UTF-8");
+
+  private Configuration conf;
+
+  private ParserFactory parserFactory;
+
+  private URLNormalizers normalizers;
+
+  private URLFilters filters;
+
+  private String defaultEncoding;
+
+  /**
+   * Parses the given feed and extracts out and parses all linked items within
+   * the feed, using the underlying ROME feed parsing library.
+   *
+   * @param content
+   *          A {@link Content} object representing the feed that is being
+   *          parsed by this {@link Parser}.
+   *
+   * @return A {@link ParseResult} containing all {@link Parse}d feeds that were
+   *         present in the feed file that this {@link Parser} dealt with, plus
+   *         one entry for the feed itself. If the feed cannot be built, an
+   *         empty parse result carrying the failure status is returned.
+   *
+   */
+  public ParseResult getParse(Content content) {
+    SyndFeed feed = null;
+    ParseResult parseResult = new ParseResult(content.getUrl());
+
+    EncodingDetector detector = new EncodingDetector(conf);
+    detector.autoDetectClues(content, true);
+    String encoding = detector.guessEncoding(content, defaultEncoding);
+    try {
+      InputSource input = new InputSource(new ByteArrayInputStream(
+          content.getContent()));
+      input.setEncoding(encoding);
+      SyndFeedInput feedInput = new SyndFeedInput();
+      feed = feedInput.build(input);
+    } catch (Exception e) {
+      // return empty parse
+      LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: "
+          + StringUtils.stringifyException(e));
+      return new ParseStatus(e)
+          .getEmptyParseResult(content.getUrl(), getConf());
+    }
+
+    // The feed's own link is best-effort: any failure while normalizing or
+    // filtering it simply means entries are stored without a "feed" field.
+    String feedLink = feed.getLink();
+    try {
+      feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
+      if (feedLink != null)
+        feedLink = filters.filter(feedLink);
+    } catch (Exception e) {
+      feedLink = null;
+    }
+
+    List<?> entries = feed.getEntries();
+    for (Object entry : entries) {
+      addToMap(parseResult, feed, feedLink, (SyndEntry) entry, content);
+    }
+
+    String feedDesc = stripTags(feed.getDescriptionEx());
+    String feedTitle = stripTags(feed.getTitleEx());
+
+    parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
+        new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
+        content.getMetadata()));
+
+    return parseResult;
+  }
+
+  /**
+   *
+   * Sets the {@link Configuration} object for this {@link Parser}. This
+   * {@link Parser} expects the following configuration properties to be set:
+   *
+   * <ul>
+   * <li>URLNormalizers - properties in the configuration object to set up the
+   * default url normalizers.</li>
+   * <li>URLFilters - properties in the configuration object to set up the
+   * default url filters.</li>
+   * </ul>
+   *
+   * The property "parser.character.encoding.default" is also read and
+   * defaults to "windows-1252".
+   *
+   * @param conf
+   *          The Hadoop {@link Configuration} object to use to configure this
+   *          {@link Parser}.
+   *
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.parserFactory = new ParserFactory(conf);
+    this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
+    this.filters = new URLFilters(conf);
+    this.defaultEncoding = conf.get("parser.character.encoding.default",
+        "windows-1252");
+  }
+
+  /**
+   *
+   * @return The {@link Configuration} object used to configure this
+   *         {@link Parser}.
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Runs a command line version of this {@link Parser}.
+   *
+   * @param args
+   *          A single argument (expected at arg[0]) representing a path on the
+   *          local filesystem that points to a feed file.
+   *
+   * @throws Exception
+   *           If any error occurs.
+   */
+  public static void main(String[] args) throws Exception {
+    if (args.length != 1) {
+      System.err.println("Usage: FeedParser <feed>");
+      System.exit(1);
+    }
+    String name = args[0];
+    String url = "file:" + name;
+    Configuration conf = NutchConfiguration.create();
+    FeedParser parser = new FeedParser();
+    parser.setConf(conf);
+    File file = new File(name);
+    byte[] bytes = new byte[(int) file.length()];
+    DataInputStream in = new DataInputStream(new FileInputStream(file));
+    try {
+      in.readFully(bytes);
+    } finally {
+      // release the file handle even if the read fails
+      in.close();
+    }
+    ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
+        "application/rss+xml", new Metadata(), conf));
+    for (Entry<Text, Parse> entry : parseResult) {
+      System.out.println("key: " + entry.getKey());
+      Parse parse = entry.getValue();
+      System.out.println("data: " + parse.getData());
+      System.out.println("text: " + parse.getText() + "\n");
+    }
+  }
+
+  /**
+   * Builds the parse for a single feed entry and stores it in
+   * <code>parseResult</code> under the entry's normalized and filtered link.
+   * Entries whose link is rejected (normalizes/filters to <code>null</code>)
+   * or cannot be processed are skipped.
+   *
+   * @param parseResult
+   *          The result the entry's parse is added to.
+   * @param feed
+   *          The feed the entry belongs to.
+   * @param feedLink
+   *          The normalized link of the feed, stored as "feed" parse metadata
+   *          when not <code>null</code>.
+   * @param entry
+   *          The feed entry to convert.
+   * @param content
+   *          The fetched feed; its metadata object is reused (and its
+   *          content-type temporarily overwritten) for the entry.
+   */
+  private void addToMap(ParseResult parseResult, SyndFeed feed,
+      String feedLink, SyndEntry entry, Content content) {
+    String link = entry.getLink(), text = null, title = null;
+    Metadata parseMeta = new Metadata(), contentMeta = content.getMetadata();
+    Parse parse = null;
+    SyndContent description = entry.getDescription();
+
+    try {
+      link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);
+
+      if (link != null)
+        link = filters.filter(link);
+    } catch (Exception e) {
+      // report through the logger instead of dumping to stderr
+      LOG.warn("Skipping feed entry, could not normalize/filter link: " + link
+          + ", exception: " + StringUtils.stringifyException(e));
+      return;
+    }
+
+    if (link == null)
+      return;
+
+    title = stripTags(entry.getTitleEx());
+
+    if (feedLink != null)
+      parseMeta.set("feed", feedLink);
+
+    addFields(parseMeta, contentMeta, feed, entry);
+
+    // some item descriptions contain markup text in them,
+    // so we temporarily set their content-type to parse them
+    // with another plugin
+    String contentType = contentMeta.get(Response.CONTENT_TYPE);
+
+    if (description != null)
+      text = description.getValue();
+
+    if (text == null) {
+      // no usable description: fall back to the concatenated entry contents
+      List<?> contents = entry.getContents();
+      StringBuilder buf = new StringBuilder();
+      for (Object syndContent : contents) {
+        buf.append(((SyndContent) syndContent).getValue());
+      }
+      text = buf.toString();
+    }
+
+    try {
+      Parser parser = parserFactory.getParsers(contentType, link)[0];
+      // The content type set by addFields always declares charset=UTF-8, so
+      // the bytes must be UTF-8 as well rather than the platform default.
+      parse = parser.getParse(
+          new Content(link, link, text.getBytes(UTF8), contentType,
+              contentMeta, conf)).get(link);
+    } catch (ParserNotFound e) { /* ignore */
+    }
+
+    if (parse != null) {
+      ParseData data = parse.getData();
+      data.getContentMeta().remove(Response.CONTENT_TYPE);
+      mergeMetadata(data.getParseMeta(), parseMeta);
+      parseResult.put(link, new ParseText(parse.getText()),
+          new ParseData(ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(),
+              data.getContentMeta(), data.getParseMeta()));
+    } else {
+      contentMeta.remove(Response.CONTENT_TYPE);
+      parseResult.put(link, new ParseText(text), new ParseData(
+          ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta,
+          parseMeta));
+    }
+
+  }
+
+  /**
+   * Removes everything that looks like a markup tag (<code>&lt;...&gt;</code>)
+   * from the value of <code>c</code> and trims the result.
+   *
+   * @return the stripped text, or the empty string if <code>c</code> or its
+   *         value is <code>null</code>.
+   */
+  private static String stripTags(SyndContent c) {
+    if (c == null)
+      return "";
+
+    String value = c.getValue();
+    // guard against a content element that carries no value
+    if (value == null)
+      return "";
+
+    return value.replaceAll("<[^>]*>", "").trim();
+  }
+
+  /**
+   * Copies author, category and date information of <code>entry</code> into
+   * <code>parseMeta</code> and sets the content-type in
+   * <code>contentMeta</code> according to the type of the entry's description
+   * (or of its first content element when there is no description). The
+   * content-type falls back to {@link #TEXT_PLAIN_CONTENT_TYPE} when no type
+   * is available.
+   */
+  private void addFields(Metadata parseMeta, Metadata contentMeta,
+      SyndFeed feed, SyndEntry entry) {
+    List<?> authors = entry.getAuthors(), categories = entry.getCategories();
+    Date published = entry.getPublishedDate(), updated = entry.getUpdatedDate();
+    String contentType = null;
+
+    if (authors != null) {
+      for (Object o : authors) {
+        SyndPerson author = (SyndPerson) o;
+        String authorName = author.getName();
+        if (checkString(authorName)) {
+          parseMeta.add(Feed.FEED_AUTHOR, authorName);
+        }
+      }
+    } else {
+      // getAuthors may return null if feed is non-atom
+      // if so, call getAuthor to get Dublin Core module creator.
+      String authorName = entry.getAuthor();
+      if (checkString(authorName)) {
+        parseMeta.set(Feed.FEED_AUTHOR, authorName);
+      }
+    }
+
+    for (Object i : categories) {
+      parseMeta.add(Feed.FEED_TAGS, ((SyndCategory) i).getName());
+    }
+
+    // dates are stored as milliseconds since the epoch, in decimal
+    if (published != null) {
+      parseMeta.set(Feed.FEED_PUBLISHED, Long.toString(published.getTime()));
+    }
+    if (updated != null) {
+      parseMeta.set(Feed.FEED_UPDATED, Long.toString(updated.getTime()));
+    }
+
+    SyndContent description = entry.getDescription();
+    if (description != null) {
+      contentType = description.getType();
+    } else {
+      // TODO: What to do if contents.size() > 1?
+      List<?> contents = entry.getContents();
+      if (contents.size() > 0) {
+        contentType = ((SyndContent) contents.get(0)).getType();
+      }
+    }
+
+    if (checkString(contentType)) {
+      // ROME may return content-type as html
+      if (contentType.equals("html"))
+        contentType = "text/html";
+      else if (contentType.equals("xhtml"))
+        contentType = "text/xhtml";
+      contentMeta.set(Response.CONTENT_TYPE, contentType + "; " + CHARSET_UTF8);
+    } else {
+      contentMeta.set(Response.CONTENT_TYPE, TEXT_PLAIN_CONTENT_TYPE);
+    }
+
+  }
+
+  /**
+   * Adds every name/value pair of <code>second</code> to <code>first</code>;
+   * existing values in <code>first</code> are kept.
+   */
+  private void mergeMetadata(Metadata first, Metadata second) {
+    for (String name : second.names()) {
+      String[] values = second.getValues(name);
+      for (String value : values) {
+        first.add(name, value);
+      }
+    }
+  }
+
+  /**
+   * @return <code>true</code> if <code>s</code> is neither <code>null</code>
+   *         nor empty.
+   */
+  private boolean checkString(String s) {
+    return s != null && !s.equals("");
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/package-info.java b/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/package-info.java
new file mode 100644
index 0000000..3b15968
--- /dev/null
+++ b/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to
the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Parse RSS feeds. + */ +package org.apache.nutch.parse.feed; + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java b/nutch-plugins/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java new file mode 100644 index 0000000..36c8739 --- /dev/null +++ b/nutch-plugins/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java @@ -0,0 +1,124 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.feed;
+
+// JDK imports
+import java.util.Iterator;
+import java.util.Map;
+
+import org.junit.Assert;
+import org.junit.Test;
+// APACHE imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolNotFound;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ *
+ * @author mattmann
+ *
+ *         Test Suite for the {@link FeedParser}.
+ *
+ */
+public class TestFeedParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/feed/build.xml during plugin compilation.
+
+  private String[] sampleFiles = { "rsstest.rss" };
+
+  public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class
+      .getName());
+
+  /**
+   * Runs the {@link FeedParser} over every sample RSS file and verifies that
+   * the resulting {@link ParseResult} holds exactly 3 entries: the feed file
+   * itself and the two links below.
+   * <ul>
+   * <li>http://www-scf.usc.edu/~mattmann/</li>
+   * <li>http://www.nutch.org</li>
+   * </ul>
+   * Every entry must also carry a non-null parse with non-null parse data.
+   *
+   * @throws ProtocolNotFound
+   *           If the {@link Protocol}Layer cannot be loaded (required to fetch
+   *           the {@link Content} for the RSS file).
+   * @throws ParseException
+   *           If the {@link Parser}Layer cannot be loaded.
+   */
+  @Test
+  public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
+    Configuration conf = NutchConfiguration.create();
+
+    for (String sampleFile : sampleFiles) {
+      // build a file: URL with forward slashes regardless of platform
+      String urlString = ("file:" + sampleDir + fileSeparator + sampleFile)
+          .replace('\\', '/');
+
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+
+      ParseResult parseResult = new ParseUtil(conf).parseByExtensionId("feed",
+          content);
+
+      Assert.assertEquals(3, parseResult.size());
+
+      boolean sawAuthorPage = false;
+      boolean sawNutchSite = false;
+      boolean sawFeedItself = false;
+
+      for (Map.Entry<Text, Parse> entry : parseResult) {
+        String key = entry.getKey().toString();
+        if (key.equals("http://www-scf.usc.edu/~mattmann/")) {
+          sawAuthorPage = true;
+        } else if (key.equals("http://www.nutch.org/")) {
+          sawNutchSite = true;
+        } else if (key.equals(urlString)) {
+          sawFeedItself = true;
+        }
+
+        Assert.assertNotNull(entry.getValue());
+        Assert.assertNotNull(entry.getValue().getData());
+      }
+
+      Assert.assertTrue("Outlinks read from sample rss file are not correct!",
+          sawAuthorPage && sawNutchSite && sawFeedItself);
+    }
+
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/headings/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/headings/build.xml b/nutch-plugins/headings/build.xml
new file mode 100644
index 0000000..d334ad1
--- /dev/null
+++ b/nutch-plugins/headings/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="headings" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/headings/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/headings/ivy.xml b/nutch-plugins/headings/ivy.xml
new file mode 100644
index 0000000..5b8393b
--- /dev/null
+++ b/nutch-plugins/headings/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="${nutch.root}/ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/headings/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/headings/plugin.xml b/nutch-plugins/headings/plugin.xml new file mode 100644 index 0000000..0d7921a --- /dev/null +++ b/nutch-plugins/headings/plugin.xml @@ -0,0 +1,45 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="headings" + name="Headings Parse Filter" + version="1.0.0" + provider-name="nutch.org"> + + + <runtime> + <library name="headings.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.parse.headings" + name="Nutch Headings Parse Filter" + point="org.apache.nutch.parse.HtmlParseFilter"> + + <implementation id="HeadingsParseFilter" + class="org.apache.nutch.parse.headings.HeadingsParseFilter"> + </implementation> + + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/headings/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/headings/pom.xml b/nutch-plugins/headings/pom.xml new file mode 100644 index 0000000..219eb71 --- /dev/null +++ b/nutch-plugins/headings/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>headings</artifactId> + <packaging>jar</packaging> + + <name>headings</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java b/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java new file mode 100644 index 0000000..657f260 --- /dev/null +++ b/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java @@ -0,0 +1,124 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.headings; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.*; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.parse.HTMLMetaTags; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.HtmlParseFilter; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NodeWalker; +import org.w3c.dom.*; + +/** + * HtmlParseFilter to retrieve h1 and h2 values from the DOM. 
+ */
+public class HeadingsParseFilter implements HtmlParseFilter {
+
+  /**
+   * Pattern used to strip surplus whitespace
+   */
+  protected static Pattern whitespacePattern = Pattern.compile("\\s+");
+
+  private Configuration conf;
+  private String[] headings;
+  private boolean multiValued = false;
+
+  /**
+   * Looks up every configured heading element in the DOM and adds each
+   * non-empty heading text to the parse metadata of the document, using the
+   * configured element name as the metadata key.
+   *
+   * @param content
+   *          The fetched content; only its URL is used, to select the parse.
+   * @param parseResult
+   *          The parse result whose entry for the content URL is enriched.
+   * @param metaTags
+   *          Not used by this filter.
+   * @param doc
+   *          The DOM fragment that is searched for heading elements.
+   * @return the same <code>parseResult</code> instance that was passed in;
+   *         it is returned untouched when no headings are configured or the
+   *         result holds no parse for the content URL.
+   */
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+    Parse parse = parseResult.get(content.getUrl());
+
+    // nothing to look for, or nowhere to store what is found
+    if (headings == null || parse == null) {
+      return parseResult;
+    }
+
+    for (String tag : headings) {
+      for (String heading : getElement(doc, tag)) {
+        if (heading == null) {
+          continue;
+        }
+
+        // String.trim() returns a new string; keep the trimmed value
+        String trimmed = heading.trim();
+        if (trimmed.length() > 0) {
+          parse.getData().getParseMeta().add(tag, trimmed);
+        }
+      }
+    }
+
+    return parseResult;
+  }
+
+  /**
+   * Stores the configuration and reads the properties "headings" (the
+   * element names to extract) and "headings.multivalued" (default
+   * <code>false</code>).
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    headings = conf.getStrings("headings");
+    multiValued = conf.getBoolean("headings.multivalued", false);
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Finds the elements with the given name (compared case-insensitively) and
+   * returns their text values. Only the first match is returned unless
+   * multi-valued extraction is enabled.
+   *
+   * @return the text values in document walk order; empty if no element
+   *         matches.
+   */
+  protected List<String> getElement(DocumentFragment doc, String element) {
+    List<String> values = new ArrayList<String>();
+    NodeWalker walker = new NodeWalker(doc);
+
+    while (walker.hasNext()) {
+      Node currentNode = walker.nextNode();
+
+      if (currentNode.getNodeType() == Node.ELEMENT_NODE) {
+        if (element.equalsIgnoreCase(currentNode.getNodeName())) {
+          values.add(getNodeValue(currentNode));
+
+          // Check for multiValued here, if disabled we don't need
+          // to discover more headings.
+          if (!multiValued) {
+            break;
+          }
+        }
+      }
+    }
+
+    return values;
+  }
+
+  /**
+   * Returns the concatenated text of the direct text-node children of the
+   * specified Node, with runs of whitespace collapsed to a single space and
+   * leading/trailing whitespace removed. Text inside nested elements is not
+   * included.
+   */
+  protected static String getNodeValue(Node node) {
+    StringBuilder buffer = new StringBuilder();
+
+    NodeList children = node.getChildNodes();
+
+    for (int i = 0; i < children.getLength(); i++) {
+      if (children.item(i).getNodeType() == Node.TEXT_NODE) {
+        buffer.append(children.item(i).getNodeValue());
+      }
+    }
+
+    // Return with stripped surplus whitespace
+    Matcher matcher = whitespacePattern.matcher(buffer.toString().trim());
+    return matcher.replaceAll(" ").trim();
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/package-info.java b/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/package-info.java
new file mode 100644
index 0000000..363e0b2
--- /dev/null
+++ b/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Parse filter to extract headings (h1, h2, etc.) from DOM parse tree. + */ +package org.apache.nutch.parse.headings; + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-anchor/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-anchor/build.xml b/nutch-plugins/index-anchor/build.xml new file mode 100644 index 0000000..597b532 --- /dev/null +++ b/nutch-plugins/index-anchor/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="index-anchor" default="jar-core"> + + <import file="../build-plugin.xml" /> + +</project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-anchor/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-anchor/ivy.xml b/nutch-plugins/index-anchor/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/index-anchor/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-anchor/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-anchor/plugin.xml b/nutch-plugins/index-anchor/plugin.xml new file mode 100644 index 0000000..208594b --- /dev/null +++ b/nutch-plugins/index-anchor/plugin.xml @@ -0,0 +1,38 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin id="index-anchor" name="Anchor Indexing Filter" version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="index-anchor.jar"> + <export name="*" /> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints" /> + </requires> + + <extension id="org.apache.nutch.indexer.anchor" + name="Nutch Anchor Indexing Filter" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="AnchorIndexingFilter" + class="org.apache.nutch.indexer.anchor.AnchorIndexingFilter" /> + </extension> + +</plugin> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-anchor/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-anchor/pom.xml b/nutch-plugins/index-anchor/pom.xml new file mode 100644 index 0000000..df01a61 --- /dev/null +++ b/nutch-plugins/index-anchor/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>index-anchor</artifactId> + <packaging>jar</packaging> + + <name>index-anchor</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java b/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java new file mode 100644 index 0000000..6c9b834 --- /dev/null +++ b/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.anchor; + +import java.util.HashSet; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.parse.Parse; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Indexing filter that offers an option to either index all inbound anchor text + * for a document or deduplicate anchors. Deduplication does have its cons. + * <p> + * See {@code anchorIndexingFilter.deduplicate} in nutch-default.xml. + */ +public class AnchorIndexingFilter implements IndexingFilter { + + public static final Logger LOG = LoggerFactory + .getLogger(AnchorIndexingFilter.class); + private Configuration conf; + private boolean deduplicate = false; + + /** + * Set the {@link Configuration} object and read the {@code anchorIndexingFilter.deduplicate} flag (defaults to false) + */ + public void setConf(Configuration conf) { + this.conf = conf; + + deduplicate = conf.getBoolean("anchorIndexingFilter.deduplicate", false); + LOG.info("Anchor deduplication is: " + (deduplicate ? "on" : "off")); + } + + /** + * Get the {@link Configuration} object + */ + public Configuration getConf() { + return this.conf; + } + + /** + * The {@link AnchorIndexingFilter} filter object which supports boolean + * configuration settings for the deduplication of anchors. See + * {@code anchorIndexingFilter.deduplicate} in nutch-default.xml. 
+ * + * @param doc + * The {@link NutchDocument} object + * @param parse + * The relevant {@link Parse} object passing through the filter + * @param url + * URL to be filtered for anchor text + * @param datum + * The {@link CrawlDatum} entry + * @param inlinks + * The {@link Inlinks} containing anchor text; may be null, in which case no anchors are added + * @return the given NutchDocument with the anchor texts added to its "anchor" field + */ + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + String[] anchors = (inlinks != null ? inlinks.getAnchors() : new String[0]); + + HashSet<String> set = null; + + for (int i = 0; i < anchors.length; i++) { + if (deduplicate) { + if (set == null) + set = new HashSet<String>(); + String lcAnchor = anchors[i].toLowerCase(); + + // Skip anchors whose lower-cased text was already added for this document + if (!set.contains(lcAnchor)) { + doc.add("anchor", anchors[i]); + + // Remember the lower-cased anchor in the set + set.add(lcAnchor); + } + } else { + doc.add("anchor", anchors[i]); + } + } + + return doc; + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/package.html b/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/package.html new file mode 100644 index 0000000..c255029 --- /dev/null +++ b/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/package.html @@ -0,0 +1,5 @@ +<html> +<body> +<p>An indexing plugin for inbound anchor text.</p><p></p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java ---------------------------------------------------------------------- diff --git 
a/nutch-plugins/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java b/nutch-plugins/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java new file mode 100644 index 0000000..08a42f3 --- /dev/null +++ b/nutch-plugins/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.anchor; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlink; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +/** + * JUnit test case which tests (1) that anchor text is indexed and (2) 
that anchor + * deduplication keeps a single value for repeated anchor text + * + * @author lewismc + * + */ +public class TestAnchorIndexingFilter { + + @Test + public void testDeduplicateAnchor() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.setBoolean("anchorIndexingFilter.deduplicate", true); + AnchorIndexingFilter filter = new AnchorIndexingFilter(); + filter.setConf(conf); + Assert.assertNotNull(filter); + NutchDocument doc = new NutchDocument(); + ParseImpl parse = new ParseImpl("foo bar", new ParseData()); + Inlinks inlinks = new Inlinks(); + inlinks.add(new Inlink("http://test1.com/", "text1")); + inlinks.add(new Inlink("http://test2.com/", "text2")); + inlinks.add(new Inlink("http://test3.com/", "text2")); + try { + filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), + new CrawlDatum(), inlinks); + } catch (Exception e) { + e.printStackTrace(); + Assert.fail(e.getMessage()); + } + Assert.assertNotNull(doc); + Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames() + .contains("anchor")); + Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor") + .getValues().size()); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-basic/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-basic/build.xml b/nutch-plugins/index-basic/build.xml new file mode 100755 index 0000000..a834290 --- /dev/null +++ b/nutch-plugins/index-basic/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="index-basic" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-basic/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-basic/ivy.xml b/nutch-plugins/index-basic/ivy.xml new file mode 100644 index 0000000..848216e --- /dev/null +++ b/nutch-plugins/index-basic/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-basic/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-basic/plugin.xml b/nutch-plugins/index-basic/plugin.xml new file mode 100755 index 0000000..c5d784d --- /dev/null +++ b/nutch-plugins/index-basic/plugin.xml @@ -0,0 +1,42 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="index-basic" + name="Basic Indexing Filter" + version="1.0.0" + provider-name="nutch.org"> + + + <runtime> + <library name="index-basic.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.indexer.basic" + name="Nutch Basic Indexing Filter" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="BasicIndexingFilter" + class="org.apache.nutch.indexer.basic.BasicIndexingFilter"/> + </extension> + +</plugin>
