http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/build.xml b/src/plugin/build.xml deleted file mode 100755 index 75ae2e7..0000000 --- a/src/plugin/build.xml +++ /dev/null @@ -1,213 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="Nutch" default="deploy-core" basedir="."> - - <target name="deploy-core"> - <ant target="compile-core" inheritall="false" dir="../.."/> - <ant target="deploy"/> - </target> - - <!-- ====================================================== --> - <!-- Build & deploy all the plugin jars. 
--> - <!-- ====================================================== --> - <target name="deploy"> - <ant dir="creativecommons" target="deploy"/> - <ant dir="feed" target="deploy"/> - <ant dir="headings" target="deploy"/> - <ant dir="index-basic" target="deploy"/> - <ant dir="index-anchor" target="deploy"/> - <ant dir="index-geoip" target="deploy"/> - <ant dir="index-more" target="deploy"/> - <ant dir="index-replace" target="deploy"/> - <ant dir="index-static" target="deploy"/> - <ant dir="index-metadata" target="deploy"/> - <ant dir="index-links" target="deploy"/> - <ant dir="mimetype-filter" target="deploy"/> - <ant dir="indexer-cloudsearch" target="deploy"/> - <ant dir="indexer-dummy" target="deploy"/> - <ant dir="indexer-elastic" target="deploy"/> - <ant dir="indexer-solr" target="deploy"/> - <ant dir="language-identifier" target="deploy"/> - <ant dir="lib-http" target="deploy"/> - <ant dir="lib-nekohtml" target="deploy"/> - <ant dir="lib-regex-filter" target="deploy"/> - <ant dir="lib-xml" target="deploy"/> - <ant dir="microformats-reltag" target="deploy"/> - <ant dir="nutch-extensionpoints" target="deploy"/> - <ant dir="protocol-file" target="deploy"/> - <ant dir="protocol-ftp" target="deploy"/> - <ant dir="protocol-http" target="deploy"/> - <ant dir="protocol-httpclient" target="deploy"/> - <ant dir="lib-htmlunit" target="deploy"/> - <ant dir="protocol-htmlunit" target="deploy" /> - <ant dir="lib-selenium" target="deploy"/> - <ant dir="protocol-selenium" target="deploy" /> - <ant dir="protocol-interactiveselenium" target="deploy" /> - <ant dir="parse-ext" target="deploy"/> - <ant dir="parse-js" target="deploy"/> - <ant dir="parse-html" target="deploy"/> - <ant dir="parse-metatags" target="deploy"/> - <ant dir="parse-swf" target="deploy"/> - <ant dir="parse-tika" target="deploy"/> - <ant dir="parse-zip" target="deploy"/> - <ant dir="scoring-depth" target="deploy"/> - <ant dir="scoring-opic" target="deploy"/> - <ant dir="scoring-link" target="deploy"/> - <ant 
dir="scoring-similarity" target="deploy"/> - <ant dir="subcollection" target="deploy"/> - <ant dir="tld" target="deploy"/> - <ant dir="urlfilter-automaton" target="deploy"/> - <ant dir="urlfilter-domain" target="deploy" /> - <ant dir="urlfilter-domainblacklist" target="deploy" /> - <ant dir="urlfilter-prefix" target="deploy"/> - <ant dir="urlfilter-regex" target="deploy"/> - <ant dir="urlfilter-suffix" target="deploy"/> - <ant dir="urlfilter-validator" target="deploy"/> - <ant dir="urlfilter-ignoreexempt" target="deploy"/> - <ant dir="parsefilter-naivebayes" target="deploy"/> - <ant dir="parsefilter-regex" target="deploy"/> - <ant dir="urlmeta" target="deploy"/> - <ant dir="urlnormalizer-ajax" target="deploy"/> - <ant dir="urlnormalizer-basic" target="deploy"/> - <ant dir="urlnormalizer-host" target="deploy"/> - <ant dir="urlnormalizer-pass" target="deploy"/> - <ant dir="urlnormalizer-protocol" target="deploy"/> - <ant dir="urlnormalizer-querystring" target="deploy"/> - <ant dir="urlnormalizer-regex" target="deploy"/> - <ant dir="urlnormalizer-slash" target="deploy"/> - </target> - - <!-- ====================================================== --> - <!-- Test all of the plugins. 
--> - <!-- ====================================================== --> - <target name="test"> - <parallel threadCount="2"> - <ant dir="creativecommons" target="test"/> - <ant dir="index-basic" target="test"/> - <ant dir="index-anchor" target="test"/> - <ant dir="index-geoip" target="test"/> - <ant dir="index-more" target="test"/> - <ant dir="index-static" target="test"/> - <ant dir="index-replace" target="test"/> - <ant dir="index-links" target="test"/> - <ant dir="mimetype-filter" target="test"/> - <ant dir="language-identifier" target="test"/> - <ant dir="lib-http" target="test"/> - <ant dir="protocol-file" target="test"/> - <ant dir="protocol-http" target="test"/> - <ant dir="protocol-httpclient" target="test"/> - <!--ant dir="parse-ext" target="test"/--> - <ant dir="feed" target="test"/> - <ant dir="parse-html" target="test"/> - <ant dir="parse-metatags" target="test"/> - <ant dir="parse-swf" target="test"/> - <ant dir="parse-tika" target="test"/> - <ant dir="parse-zip" target="test"/> - <ant dir="parsefilter-regex" target="test"/> - <ant dir="subcollection" target="test"/> - <ant dir="urlfilter-automaton" target="test"/> - <ant dir="urlfilter-domain" target="test"/> - <ant dir="urlfilter-domainblacklist" target="test"/> - <ant dir="urlfilter-prefix" target="test"/> - <ant dir="urlfilter-regex" target="test"/> - <ant dir="urlfilter-suffix" target="test"/> - <ant dir="urlfilter-validator" target="test"/> - <ant dir="urlfilter-ignoreexempt" target="test"/> - <ant dir="urlnormalizer-ajax" target="test"/> - <ant dir="urlnormalizer-basic" target="test"/> - <ant dir="urlnormalizer-host" target="test"/> - <ant dir="urlnormalizer-pass" target="test"/> - <ant dir="urlnormalizer-protocol" target="test"/> - <ant dir="urlnormalizer-querystring" target="test"/> - <ant dir="urlnormalizer-regex" target="test"/> - <ant dir="urlnormalizer-slash" target="test"/> - </parallel> - </target> - - <!-- ====================================================== --> - <!-- Clean all of the 
plugins. --> - <!-- ====================================================== --> - <target name="clean"> - <ant dir="creativecommons" target="clean"/> - <ant dir="feed" target="clean"/> - <ant dir="headings" target="clean"/> - <ant dir="index-basic" target="clean"/> - <ant dir="index-anchor" target="clean"/> - <ant dir="index-geoip" target="clean"/> - <ant dir="index-more" target="clean"/> - <ant dir="index-static" target="clean"/> - <ant dir="index-replace" target="clean"/> - <ant dir="index-metadata" target="clean"/> - <ant dir="index-links" target="clean"/> - <ant dir="mimetype-filter" target="clean"/> - <ant dir="indexer-cloudsearch" target="clean"/> - <ant dir="indexer-dummy" target="clean"/> - <ant dir="indexer-elastic" target="clean"/> - <ant dir="indexer-solr" target="clean"/> - <ant dir="language-identifier" target="clean"/> - <!-- <ant dir="lib-commons-httpclient" target="clean"/> --> - <ant dir="lib-http" target="clean"/> - <!-- <ant dir="lib-lucene-analyzers" target="clean"/>--> - <ant dir="lib-nekohtml" target="clean"/> - <ant dir="lib-regex-filter" target="clean"/> - <ant dir="lib-xml" target="clean"/> - <ant dir="microformats-reltag" target="clean"/> - <ant dir="nutch-extensionpoints" target="clean"/> - <ant dir="protocol-file" target="clean"/> - <ant dir="protocol-ftp" target="clean"/> - <ant dir="protocol-http" target="clean"/> - <ant dir="protocol-httpclient" target="clean"/> - <ant dir="lib-htmlunit" target="clean"/> - <ant dir="protocol-htmlunit" target="clean" /> - <ant dir="lib-selenium" target="clean"/> - <ant dir="protocol-selenium" target="clean" /> - <ant dir="protocol-interactiveselenium" target="clean" /> - <ant dir="parse-ext" target="clean"/> - <ant dir="parse-js" target="clean"/> - <ant dir="parse-html" target="clean"/> - <ant dir="parse-metatags" target="clean"/> - <ant dir="parse-swf" target="clean"/> - <ant dir="parse-tika" target="clean"/> - <ant dir="parse-zip" target="clean"/> - <ant dir="parsefilter-regex" target="clean"/> - <ant 
dir="scoring-depth" target="clean"/> - <ant dir="scoring-opic" target="clean"/> - <ant dir="scoring-link" target="clean"/> - <ant dir="scoring-similarity" target="clean"/> - <ant dir="subcollection" target="clean"/> - <ant dir="tld" target="clean"/> - <ant dir="urlfilter-automaton" target="clean"/> - <ant dir="urlfilter-domain" target="clean" /> - <ant dir="urlfilter-domainblacklist" target="clean" /> - <ant dir="urlfilter-prefix" target="clean"/> - <ant dir="urlfilter-regex" target="clean"/> - <ant dir="urlfilter-suffix" target="clean"/> - <ant dir="urlfilter-validator" target="clean"/> - <ant dir="urlfilter-ignoreexempt" target="clean"/> - <ant dir="parsefilter-naivebayes" target="clean" /> - <ant dir="urlmeta" target="clean"/> - <ant dir="urlnormalizer-ajax" target="clean"/> - <ant dir="urlnormalizer-basic" target="clean"/> - <ant dir="urlnormalizer-host" target="clean"/> - <ant dir="urlnormalizer-pass" target="clean"/> - <ant dir="urlnormalizer-protocol" target="clean"/> - <ant dir="urlnormalizer-querystring" target="clean"/> - <ant dir="urlnormalizer-regex" target="clean"/> - <ant dir="urlnormalizer-slash" target="clean"/> - </target> -</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/README.txt ---------------------------------------------------------------------- diff --git a/src/plugin/creativecommons/README.txt b/src/plugin/creativecommons/README.txt deleted file mode 100644 index d4d7b65..0000000 --- a/src/plugin/creativecommons/README.txt +++ /dev/null @@ -1 +0,0 @@ -Support for crawling and searching Creative-Commons licensed content. http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/creativecommons/build.xml b/src/plugin/creativecommons/build.xml deleted file mode 100755 index 6443d7f..0000000 --- a/src/plugin/creativecommons/build.xml +++ /dev/null @@ -1,28 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<project name="creativecommons" default="jar-core"> - - <import file="../build-plugin.xml"/> - - <!-- Deploy Unit test dependencies --> - <target name="deps-test"> - <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> - <!-- <ant target="deploy" inheritall="false" dir="../parse-html"/> --> - </target> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/conf/crawl-urlfilter.txt ---------------------------------------------------------------------- diff --git a/src/plugin/creativecommons/conf/crawl-urlfilter.txt b/src/plugin/creativecommons/conf/crawl-urlfilter.txt deleted file mode 100644 index 324617f..0000000 --- a/src/plugin/creativecommons/conf/crawl-urlfilter.txt +++ /dev/null @@ -1,18 +0,0 @@ -# Creative Commnons crawl filter - -# Each non-comment, non-blank line contains a regular expression -# prefixed by '+' or '-'. The first matching pattern in the file -# determines whether a URL is included or ignored. If no pattern -# matches, the URL is ignored. - -# skip file:, ftp:, & mailto: urls --^(file|ftp|mailto|https): - -# skip image and other suffixes we can't yet parse --\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|rtf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|mp3|rss|xml|doc|pdf|txt|DOC|PDF|TXT)$ - -# skip URLs containing certain characters as probable queries, etc. --[?*!@=] - -# accept anything else -+. 
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/conf/nutch-site.xml ---------------------------------------------------------------------- diff --git a/src/plugin/creativecommons/conf/nutch-site.xml b/src/plugin/creativecommons/conf/nutch-site.xml deleted file mode 100644 index 71e344b..0000000 --- a/src/plugin/creativecommons/conf/nutch-site.xml +++ /dev/null @@ -1,50 +0,0 @@ -<?xml version="1.0"?> -<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?> - -<!-- Creative Commons' Nutch configuration --> - -<nutch-conf> - -<property> - <name>http.agent.name</name> - <value>CreativeCommons</value> - <description>Our HTTP 'User-Agent' request header.</description> -</property> - -<property> - <name>http.robots.agents</name> - <value>CreativeCommons,Nutch,*</value> - <description>The agent strings we'll look for in robots.txt files, - comma-separated, in decreasing order of precedence.</description> -</property> - -<property> - <name>fetcher.server.delay</name> - <value>2.0</value> - <description>We need to be more polite than when crawling an - intranet that we control.</description> -</property> - -<property> - <name>http.max.delays</name> - <value>3</value> - <description>The CC crawl visits a large number of different - hosts, so we should not need to delay much.</description> -</property> - -<property> - <name>creativecommons.exclude.unlicensed</name> - <value>true</value> - <description>Exclude HTML content which does not contain a CC license. - </description> -</property> - -<property> - <name>plugin.excludes</name> - <value>parse-(?!html).*</value> - <description>Exclude non-HTML content, since we don't know how to - find a CC license in anything but HTML. 
- </description> -</property> - -</nutch-conf> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/data/anchor.html ---------------------------------------------------------------------- diff --git a/src/plugin/creativecommons/data/anchor.html b/src/plugin/creativecommons/data/anchor.html deleted file mode 100755 index 90b5227..0000000 --- a/src/plugin/creativecommons/data/anchor.html +++ /dev/null @@ -1,9 +0,0 @@ -<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/1999/REC-html401-19991224/loose.dtd"> -<html> -<head> -</head> -<body> -<p><a href="http://creativecommons.org/licenses/by-nc-sa/1.0"><img alt="Creative Commons License" src="http://creativecommons.org/images/public/somerights.gif" align="right"></a>This file is licensed under a -<a href="http://creativecommons.org/licenses/by-nc-sa/1.0">Creative Commons License</a>.</p> -</body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/data/rdf.html ---------------------------------------------------------------------- diff --git a/src/plugin/creativecommons/data/rdf.html b/src/plugin/creativecommons/data/rdf.html deleted file mode 100755 index fb2c34d..0000000 --- a/src/plugin/creativecommons/data/rdf.html +++ /dev/null @@ -1,35 +0,0 @@ -<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> -<html> - <head> - </head> - <body> - -<!-- Creative Commons License --> -<p><a href="http://creativecommons.org/licenses/by-nc/1.0"><img alt="Creative Commons License" border="0" src="http://creativecommons.org/images/public/somerights.gif" /></a><br /> -This work is licensed under a -<a href="http://creativecommons.org/licenses/by-nc/1.0">Creative Commons License</a>. 
-<!-- end Creative Commons License --> - - <!-- -<rdf:RDF xmlns="http://web.resource.org/cc/" - xmlns:dc="http://purl.org/dc/elements/1.1/" - xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> -<Work rdf:about="http://boingboing.net"> - <dc:type rdf:resource="http://purl.org/dc/dcmitype/Text" /> - <license rdf:resource="http://creativecommons.org/licenses/by-nc/1.0" /> -</Work> - -<License rdf:about="http://creativecommons.org/licenses/by-nc/1.0"> - <requires rdf:resource="http://web.resource.org/cc/Attribution" /> - <permits rdf:resource="http://web.resource.org/cc/DerivativeWorks" /> - <permits rdf:resource="http://web.resource.org/cc/Reproduction" /> - <permits rdf:resource="http://web.resource.org/cc/Distribution" /> - <prohibits rdf:resource="http://web.resource.org/cc/CommercialUse" /> - <requires rdf:resource="http://web.resource.org/cc/Notice" /> -</License> - -</rdf:RDF> - ---> - </body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/data/rel.html ---------------------------------------------------------------------- diff --git a/src/plugin/creativecommons/data/rel.html b/src/plugin/creativecommons/data/rel.html deleted file mode 100755 index 413d52f..0000000 --- a/src/plugin/creativecommons/data/rel.html +++ /dev/null @@ -1,6 +0,0 @@ -<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> -<html xmlns="http://www.w3.org/1999/xhtml" lang="en"><head> -</head><body> -<a rel="license" href="http://creativecommons.org/licenses/by-nc/2.0">CC by-nc</a> -</body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/creativecommons/ivy.xml b/src/plugin/creativecommons/ivy.xml deleted file mode 100644 index 1a86d68..0000000 --- a/src/plugin/creativecommons/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml 
version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/creativecommons/plugin.xml b/src/plugin/creativecommons/plugin.xml deleted file mode 100755 index de9cf36..0000000 --- a/src/plugin/creativecommons/plugin.xml +++ /dev/null @@ -1,48 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. 
- The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="creativecommons" - name="Creative Commons Plugins" - version="1.0.0" - provider-name="nutch.org"> - - <runtime> - <library name="creativecommons.jar"> - <export name="*"/> - </library> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - </requires> - - <extension id="org.creativecommons.nutch.CCParseFilter" - name="Creative Commons Metadata Filter" - point="org.apache.nutch.parse.HtmlParseFilter"> - <implementation id="CCParseFilter" - class="org.creativecommons.nutch.CCParseFilter"/> - </extension> - - <extension id="org.creativecommons.nutch.CCIndexingFilter" - name="Creative Commons Indexing Filter" - point="org.apache.nutch.indexer.IndexingFilter"> - <implementation id="CCIndexingFilter" - class="org.creativecommons.nutch.CCIndexingFilter"/> - </extension> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java deleted file mode 100644 index e7c55c4..0000000 --- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java +++ /dev/null @@ -1,124 +0,0 @@ -/** - * Licensed to the Apache Software 
Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.creativecommons.nutch; - -import org.apache.nutch.metadata.CreativeCommons; - -import org.apache.nutch.parse.Parse; - -import org.apache.nutch.indexer.IndexingFilter; -import org.apache.nutch.indexer.IndexingException; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.hadoop.io.Text; - -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.metadata.Metadata; - -import org.apache.hadoop.conf.Configuration; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.*; -import java.net.URL; -import java.net.MalformedURLException; - -/** Adds basic searchable fields to a document. */ -public class CCIndexingFilter implements IndexingFilter { - public static final Logger LOG = LoggerFactory - .getLogger(CCIndexingFilter.class); - - /** The name of the document field we use. 
*/ - public static String FIELD = "cc"; - - private Configuration conf; - - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, - CrawlDatum datum, Inlinks inlinks) throws IndexingException { - - Metadata metadata = parse.getData().getParseMeta(); - // index the license - String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL); - if (licenseUrl != null) { - if (LOG.isInfoEnabled()) { - LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString()); - } - - // add the entire license as cc:license=xxx - addFeature(doc, "license=" + licenseUrl); - - // index license attributes extracted of the license url - addUrlFeatures(doc, licenseUrl); - } - - // index the license location as cc:meta=xxx - String licenseLocation = metadata.get(CreativeCommons.LICENSE_LOCATION); - if (licenseLocation != null) { - addFeature(doc, "meta=" + licenseLocation); - } - - // index the work type cc:type=xxx - String workType = metadata.get(CreativeCommons.WORK_TYPE); - if (workType != null) { - addFeature(doc, workType); - } - - return doc; - } - - /** - * Add the features represented by a license URL. Urls are of the form - * "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a - * license feature. 
- */ - public void addUrlFeatures(NutchDocument doc, String urlString) { - try { - URL url = new URL(urlString); - - // tokenize the path of the url, breaking at slashes and dashes - StringTokenizer names = new StringTokenizer(url.getPath(), "/-"); - - if (names.hasMoreTokens()) - names.nextToken(); // throw away "licenses" - - // add a feature per component after "licenses" - while (names.hasMoreTokens()) { - String feature = names.nextToken(); - addFeature(doc, feature); - } - } catch (MalformedURLException e) { - if (LOG.isWarnEnabled()) { - LOG.warn("CC: failed to parse url: " + urlString + " : " + e); - } - } - } - - private void addFeature(NutchDocument doc, String feature) { - doc.add(FIELD, feature); - } - - public void setConf(Configuration conf) { - this.conf = conf; - } - - public Configuration getConf() { - return this.conf; - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java deleted file mode 100644 index 1fa951e..0000000 --- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java +++ /dev/null @@ -1,300 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.creativecommons.nutch; - -import org.apache.nutch.metadata.CreativeCommons; -import org.apache.nutch.parse.*; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.metadata.Metadata; -import org.apache.hadoop.conf.Configuration; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.*; -import java.io.*; -import java.net.*; -import javax.xml.parsers.*; -import org.xml.sax.InputSource; -import org.w3c.dom.*; - -/** Adds metadata identifying the Creative Commons license used, if any. */ -public class CCParseFilter implements HtmlParseFilter { - public static final Logger LOG = LoggerFactory.getLogger(CCParseFilter.class); - - /** Walks DOM tree, looking for RDF in comments and licenses in anchors. */ - public static class Walker { - private URL base; // base url of page - private String rdfLicense; // subject url found, if any - private URL relLicense; // license url found, if any - private URL anchorLicense; // anchor url found, if any - private String workType; // work type URI - - private Walker(URL base) { - this.base = base; - } - - /** Scan the document adding attributes to metadata. 
*/ - public static void walk(Node doc, URL base, Metadata metadata, - Configuration conf) throws ParseException { - - // walk the DOM tree, scanning for license data - Walker walker = new Walker(base); - walker.walk(doc); - - // interpret results of walk - String licenseUrl = null; - String licenseLocation = null; - if (walker.rdfLicense != null) { // 1st choice: subject in RDF - licenseLocation = "rdf"; - licenseUrl = walker.rdfLicense; - } else if (walker.relLicense != null) { // 2nd: anchor w/ rel=license - licenseLocation = "rel"; - licenseUrl = walker.relLicense.toString(); - } else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC license - licenseLocation = "a"; - licenseUrl = walker.anchorLicense.toString(); - } else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) { - throw new ParseException("No CC license. Excluding."); - } - - // add license to metadata - if (licenseUrl != null) { - if (LOG.isInfoEnabled()) { - LOG.info("CC: found " + licenseUrl + " in " + licenseLocation - + " of " + base); - } - metadata.add(CreativeCommons.LICENSE_URL, licenseUrl); - metadata.add(CreativeCommons.LICENSE_LOCATION, licenseLocation); - } - - if (walker.workType != null) { - if (LOG.isInfoEnabled()) { - LOG.info("CC: found " + walker.workType + " in " + base); - } - metadata.add(CreativeCommons.WORK_TYPE, walker.workType); - } - - } - - /** Scan the document looking for RDF in comments and license elements. */ - private void walk(Node node) { - - // check element nodes for license URL - if (node instanceof Element) { - findLicenseUrl((Element) node); - } - - // check comment nodes for license RDF - if (node instanceof Comment) { - findRdf(((Comment) node).getData()); - } - - // recursively walk child nodes - NodeList children = node.getChildNodes(); - for (int i = 0; children != null && i < children.getLength(); i++) { - walk(children.item(i)); - } - } - - /** - * Extract license url from element, if any. 
Thse are the href attribute of - * anchor elements with rel="license". These must also point to - * http://creativecommons.org/licenses/. - */ - private void findLicenseUrl(Element element) { - // only look in Anchor elements - if (!"a".equalsIgnoreCase(element.getTagName())) - return; - - // require an href - String href = element.getAttribute("href"); - if (href == null) - return; - - try { - URL url = new URL(base, href); // resolve the url - - // check that it's a CC license URL - if ("http".equalsIgnoreCase(url.getProtocol()) - && "creativecommons.org".equalsIgnoreCase(url.getHost()) - && url.getPath() != null && url.getPath().startsWith("/licenses/") - && url.getPath().length() > "/licenses/".length()) { - - // check rel="license" - String rel = element.getAttribute("rel"); - if (rel != null && "license".equals(rel) && this.relLicense == null) { - this.relLicense = url; // found rel license - } else if (this.anchorLicense == null) { - this.anchorLicense = url; // found anchor license - } - } - } catch (MalformedURLException e) { // ignore malformed urls - } - } - - /** Configure a namespace aware XML parser. */ - private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory - .newInstance(); - static { - FACTORY.setNamespaceAware(true); - } - - /** Creative Commons' namespace URI. */ - private static final String CC_NS = "http://web.resource.org/cc/"; - - /** Dublin Core namespace URI. */ - private static final String DC_NS = "http://purl.org/dc/elements/1.1/"; - - /** RDF syntax namespace URI. 
*/ - private static final String RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; - - private void findRdf(String comment) { - // first check for likely RDF in comment - int rdfPosition = comment.indexOf("RDF"); - if (rdfPosition < 0) - return; // no RDF, abort - int nsPosition = comment.indexOf(CC_NS); - if (nsPosition < 0) - return; // no RDF, abort - - // try to parse the XML - Document doc; - try { - DocumentBuilder parser = FACTORY.newDocumentBuilder(); - doc = parser.parse(new InputSource(new StringReader(comment))); - } catch (Exception e) { - if (LOG.isWarnEnabled()) { - LOG.warn("CC: Failed to parse RDF in " + base + ": " + e); - } - // e.printStackTrace(); - return; - } - - // check that root is rdf:RDF - NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF"); - if (roots.getLength() != 1) { - if (LOG.isWarnEnabled()) { - LOG.warn("CC: No RDF root in " + base); - } - return; - } - Element rdf = (Element) roots.item(0); - - // get cc:License nodes inside rdf:RDF - NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License"); - for (int i = 0; i < licenses.getLength(); i++) { - - Element l = (Element) licenses.item(i); - - // license is rdf:about= attribute from cc:License - this.rdfLicense = l.getAttributeNodeNS(RDF_NS, "about").getValue(); - - // walk predicates of cc:License - NodeList predicates = l.getChildNodes(); - for (int j = 0; j < predicates.getLength(); j++) { - Node predicateNode = predicates.item(j); - if (!(predicateNode instanceof Element)) - continue; - Element predicateElement = (Element) predicateNode; - - // extract predicates of cc:xxx predicates - if (!CC_NS.equals(predicateElement.getNamespaceURI())) { - continue; - } - - // add object and predicate to metadata - // metadata.put(object, predicate); - // if (LOG.isInfoEnabled()) { - // LOG.info("CC: found: "+predicate+"="+object); - // } - } - } - - // get cc:Work nodes from rdf:RDF - NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work"); - for (int i = 0; i < 
works.getLength(); i++) { - // get dc:type nodes from cc:Work - NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type"); - - for (int j = 0; j < types.getLength(); j++) { - Element type = (Element) types.item(j); - String workUri = type.getAttributeNodeNS(RDF_NS, "resource") - .getValue(); - this.workType = WORK_TYPE_NAMES.get(workUri); - } - } - } - } - - private static final HashMap<String, String> WORK_TYPE_NAMES = new HashMap<String, String>(); - static { - WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video"); - WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image"); - WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio"); - WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text"); - WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive", - "interactive"); - WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software"); - WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image"); - } - - private Configuration conf; - - /** - * Adds metadata or otherwise modifies a parse of an HTML document, given the - * DOM tree of a page. 
- */ - public ParseResult filter(Content content, ParseResult parseResult, - HTMLMetaTags metaTags, DocumentFragment doc) { - - // get parse obj - Parse parse = parseResult.get(content.getUrl()); - - // construct base url - URL base; - try { - base = new URL(content.getBaseUrl()); - } catch (MalformedURLException e) { - Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf()); - parseResult.put(content.getUrl(), new ParseText(emptyParse.getText()), - emptyParse.getData()); - return parseResult; - } - - try { - // extract license metadata - Walker.walk(doc, base, parse.getData().getParseMeta(), getConf()); - } catch (ParseException e) { - Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf()); - parseResult.put(content.getUrl(), new ParseText(emptyParse.getText()), - emptyParse.getData()); - return parseResult; - } - - return parseResult; - } - - public void setConf(Configuration conf) { - this.conf = conf; - } - - public Configuration getConf() { - return this.conf; - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/src/java/org/creativecommons/nutch/package.html ---------------------------------------------------------------------- diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/package.html b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/package.html deleted file mode 100644 index 0c91293..0000000 --- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>Sample plugins that parse and index Creative Commons medadata.</p> -</body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java 
b/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java deleted file mode 100755 index 41be9ed..0000000 --- a/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java +++ /dev/null @@ -1,73 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.creativecommons.nutch; - -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.protocol.Content; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -import java.io.*; - -public class TestCCParseFilter { - - private static final File testDir = new File(System.getProperty("test.input")); - - @Test - public void testPages() throws Exception { - pageTest(new File(testDir, "anchor.html"), "http://foo.com/", - "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null); - // Tika returns <a> whereas parse-html returns <rel> - // check later - pageTest(new File(testDir, "rel.html"), "http://foo.com/", - "http://creativecommons.org/licenses/by-nc/2.0", "rel", null); - // Tika returns <a> whereas parse-html returns <rdf> - // check later - pageTest(new File(testDir, "rdf.html"), "http://foo.com/", - "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text"); - } - - public void pageTest(File file, String url, String license, String location, - String type) throws Exception { - - String contentType = "text/html"; - InputStream in = new FileInputStream(file); - ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length()); - byte[] buffer = new byte[1024]; - int i; - while ((i = in.read(buffer)) != -1) { - out.write(buffer, 0, i); - } - in.close(); - byte[] bytes = out.toByteArray(); - Configuration conf = NutchConfiguration.create(); - - Content content = new Content(url, url, bytes, contentType, new Metadata(), - conf); - Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); - - Metadata metadata = parse.getData().getParseMeta(); - Assert.assertEquals(license, metadata.get("License-Url")); - Assert.assertEquals(location, metadata.get("License-Location")); - Assert.assertEquals(type, metadata.get("Work-Type")); - } -} 
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/feed/build.xml b/src/plugin/feed/build.xml deleted file mode 100644 index 7fe7050..0000000 --- a/src/plugin/feed/build.xml +++ /dev/null @@ -1,45 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> - -<project name="feed" default="jar-core"> - - <import file="../build-plugin.xml" /> - - <!-- Build compilation dependencies --> - <target name="deps-jar"> - <ant target="jar" inheritall="false" dir="../lib-xml"/> - </target> - - <!-- Add compilation dependencies to classpath --> - <path id="plugin.deps"> - <fileset dir="${nutch.root}/build"> - <include name="**/lib-xml/*.jar" /> - </fileset> - </path> - - <!-- Deploy Unit test dependencies --> - <target name="deps-test"> - <ant target="deploy" inheritall="false" - dir="../nutch-extensionpoints" /> - <ant target="deploy" inheritall="false" dir="../protocol-file" /> - </target> - - <!-- for junit test --> - <mkdir dir="${build.test}/data" /> - <copy file="sample/rsstest.rss" todir="${build.test}/data" /> -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/feed/ivy.xml b/src/plugin/feed/ivy.xml deleted file mode 100644 index c29bd03..0000000 --- a/src/plugin/feed/ivy.xml +++ /dev/null @@ -1,43 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - <dependency org="rome" name="rome" rev="0.9" conf="*->master"/> - <dependency org="org.jdom" name="jdom" rev="1.1" conf="*->master"/> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/feed/plugin.xml b/src/plugin/feed/plugin.xml deleted file mode 100644 index 3a68d8d..0000000 --- a/src/plugin/feed/plugin.xml +++ /dev/null @@ -1,49 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<plugin id="feed" name="Feed Parse/Index/Query Plug-in" version="1.0.0" - provider-name="nutch.org"> - <runtime> - <library name="feed.jar"> - <export name="*" /> - </library> - <library name="rome-0.9.jar" /> - <library name="jdom-1.1.jar" /> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints" /> - <import plugin="lib-xml" /> - </requires> - - <extension id="org.apache.nutch.parse.feed" name="Feed Parser" - point="org.apache.nutch.parse.Parser"> - - <implementation id="org.apache.nutch.parse.feed.FeedParser" - class="org.apache.nutch.parse.feed.FeedParser"> - <parameter name="contentType" value="application/rss+xml" /> - <parameter name="contentType" value="application/atom+xml" /> - <parameter name="contentType" value="text/xml" /> - <parameter name="pathSuffix" value="rss" /> - </implementation> - </extension> - <extension id="org.apache.nutch.indexer.feed" name="Feed Indexer" - point="org.apache.nutch.indexer.IndexingFilter"> - <implementation id="FeedIndexingFilter" - class="org.apache.nutch.indexer.feed.FeedIndexingFilter" /> - </extension> -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/sample/rsstest.rss ---------------------------------------------------------------------- diff --git a/src/plugin/feed/sample/rsstest.rss b/src/plugin/feed/sample/rsstest.rss deleted file mode 100644 index 758f6a1..0000000 --- a/src/plugin/feed/sample/rsstest.rss +++ /dev/null @@ -1,36 +0,0 @@ -<?xml version="1.0" encoding="ISO-8859-1" ?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<rss version="0.91"> - <channel> - <title>TestChannel</title> - <link>http://test.channel.com/</link> - <description>Sample RSS File for Junit test</description> - <language>en-us</language> - - <item> - <title>Home Page of Chris Mattmann</title> - <link>http://www-scf.usc.edu/~mattmann/</link> - <description>Chris Mattmann's home page</description> - </item> - <item> - <title>Awesome Open Source Search Engine</title> - <link>http://www.nutch.org/</link> - <description>Yup, that's what it is</description> - </item> - </channel> -</rss> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java b/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java deleted file mode 100644 index 94b440a..0000000 --- a/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java +++ /dev/null @@ -1,129 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.indexer.feed; - -//JDK imports -import java.util.Date; - -//APACHE imports -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.IndexingException; -import org.apache.nutch.indexer.IndexingFilter; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.metadata.Feed; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; - -/** - * @author dogacan - * @author mattmann - * @since NUTCH-444 - * - * An {@link IndexingFilter} implementation to pull out the relevant - * extracted {@link Metadata} fields from the RSS feeds and into the - * index. - * - */ -public class FeedIndexingFilter implements IndexingFilter { - - public static final String dateFormatStr = "yyyyMMddHHmm"; - - private Configuration conf; - - private final static String PUBLISHED_DATE = "publishedDate"; - - private final static String UPDATED_DATE = "updatedDate"; - - /** - * Extracts out the relevant fields: - * - * <ul> - * <li>FEED_AUTHOR</li> - * <li>FEED_TAGS</li> - * <li>FEED_PUBLISHED</li> - * <li>FEED_UPDATED</li> - * <li>FEED</li> - * </ul> - * - * And sends them to the {@link Indexer} for indexing within the Nutch index. 
- * - */ - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, - CrawlDatum datum, Inlinks inlinks) throws IndexingException { - ParseData parseData = parse.getData(); - Metadata parseMeta = parseData.getParseMeta(); - - String[] authors = parseMeta.getValues(Feed.FEED_AUTHOR); - String[] tags = parseMeta.getValues(Feed.FEED_TAGS); - String published = parseMeta.get(Feed.FEED_PUBLISHED); - String updated = parseMeta.get(Feed.FEED_UPDATED); - String feed = parseMeta.get(Feed.FEED); - - if (authors != null) { - for (String author : authors) { - doc.add(Feed.FEED_AUTHOR, author); - } - } - - if (tags != null) { - for (String tag : tags) { - doc.add(Feed.FEED_TAGS, tag); - } - } - - if (feed != null) - doc.add(Feed.FEED, feed); - - if (published != null) { - Date date = new Date(Long.parseLong(published)); - doc.add(PUBLISHED_DATE, date); - } - - if (updated != null) { - Date date = new Date(Long.parseLong(updated)); - doc.add(UPDATED_DATE, date); - } - - return doc; - } - - /** - * @return the {@link Configuration} object used to configure this - * {@link IndexingFilter}. - */ - public Configuration getConf() { - return conf; - } - - /** - * Sets the {@link Configuration} object used to configure this - * {@link IndexingFilter}. - * - * @param conf - * The {@link Configuration} object used to configure this - * {@link IndexingFilter}. 
- */ - public void setConf(Configuration conf) { - this.conf = conf; - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java ---------------------------------------------------------------------- diff --git a/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java b/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java deleted file mode 100644 index 8f52628..0000000 --- a/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Indexing filter to index meta data from RSS feeds. 
- */ -package org.apache.nutch.indexer.feed; - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java b/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java deleted file mode 100644 index 936c885..0000000 --- a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java +++ /dev/null @@ -1,374 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.nutch.parse.feed; - -// JDK imports -import java.io.ByteArrayInputStream; -import java.io.DataInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.util.Date; -import java.util.Iterator; -import java.util.List; -import java.util.Map.Entry; - -// APACHE imports -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.util.StringUtils; -// import org.apache.nutch.indexer.anchor.AnchorIndexingFilter; removed as per NUTCH-1078 -import org.apache.nutch.metadata.Feed; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.net.URLFilters; -import org.apache.nutch.net.URLNormalizers; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseResult; -import org.apache.nutch.parse.ParseStatus; -import org.apache.nutch.parse.ParseText; -import org.apache.nutch.parse.Parser; -import org.apache.nutch.parse.ParserFactory; -import org.apache.nutch.parse.ParserNotFound; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.util.EncodingDetector; -import org.apache.nutch.util.NutchConfiguration; -import org.xml.sax.InputSource; - -// ROME imports -import com.sun.syndication.feed.synd.SyndCategory; -import com.sun.syndication.feed.synd.SyndContent; -import com.sun.syndication.feed.synd.SyndEntry; -import com.sun.syndication.feed.synd.SyndFeed; -import com.sun.syndication.feed.synd.SyndPerson; -import com.sun.syndication.io.SyndFeedInput; - -/** - * - * @author dogacan - * @author mattmann - * @since NUTCH-444 - * - * <p> - * A new RSS/ATOM Feed{@link Parser} that rapidly parses all referenced - * links and content present in the feed. 
- * </p> - * - */ -public class FeedParser implements Parser { - - public static final String CHARSET_UTF8 = "charset=UTF-8"; - - public static final String TEXT_PLAIN_CONTENT_TYPE = "text/plain; " - + CHARSET_UTF8; - - public static final Logger LOG = LoggerFactory.getLogger(FeedParser.class); - - private Configuration conf; - - private ParserFactory parserFactory; - - private URLNormalizers normalizers; - - private URLFilters filters; - - private String defaultEncoding; - - /** - * Parses the given feed and extracts out and parsers all linked items within - * the feed, using the underlying ROME feed parsing library. - * - * @param content - * A {@link Content} object representing the feed that is being - * parsed by this {@link Parser}. - * - * @return A {@link ParseResult} containing all {@link Parse}d feeds that were - * present in the feed file that this {@link Parser} dealt with. - * - */ - public ParseResult getParse(Content content) { - SyndFeed feed = null; - ParseResult parseResult = new ParseResult(content.getUrl()); - - EncodingDetector detector = new EncodingDetector(conf); - detector.autoDetectClues(content, true); - String encoding = detector.guessEncoding(content, defaultEncoding); - try { - InputSource input = new InputSource(new ByteArrayInputStream( - content.getContent())); - input.setEncoding(encoding); - SyndFeedInput feedInput = new SyndFeedInput(); - feed = feedInput.build(input); - } catch (Exception e) { - // return empty parse - LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: " - + StringUtils.stringifyException(e)); - return new ParseStatus(e) - .getEmptyParseResult(content.getUrl(), getConf()); - } - - String feedLink = feed.getLink(); - try { - feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK); - if (feedLink != null) - feedLink = filters.filter(feedLink); - } catch (Exception e) { - feedLink = null; - } - - List<?> entries = feed.getEntries(); - for (Object entry : entries) { - 
addToMap(parseResult, feed, feedLink, (SyndEntry) entry, content); - } - - String feedDesc = stripTags(feed.getDescriptionEx()); - String feedTitle = stripTags(feed.getTitleEx()); - - parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData( - new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0], - content.getMetadata())); - - return parseResult; - } - - /** - * - * Sets the {@link Configuration} object for this {@link Parser}. This - * {@link Parser} expects the following configuration properties to be set: - * - * <ul> - * <li>URLNormalizers - properties in the configuration object to set up the - * default url normalizers.</li> - * <li>URLFilters - properties in the configuration object to set up the - * default url filters.</li> - * </ul> - * - * @param conf - * The Hadoop {@link Configuration} object to use to configure this - * {@link Parser}. - * - */ - public void setConf(Configuration conf) { - this.conf = conf; - this.parserFactory = new ParserFactory(conf); - this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK); - this.filters = new URLFilters(conf); - this.defaultEncoding = conf.get("parser.character.encoding.default", - "windows-1252"); - } - - /** - * - * @return The {@link Configuration} object used to configure this - * {@link Parser}. - */ - public Configuration getConf() { - return this.conf; - } - - /** - * Runs a command line version of this {@link Parser}. - * - * @param args - * A single argument (expected at arg[0]) representing a path on the - * local filesystem that points to a feed file. - * - * @throws Exception - * If any error occurs. 
- */ - public static void main(String[] args) throws Exception { - if (args.length != 1) { - System.err.println("Usage: FeedParser <feed>"); - System.exit(1); - } - String name = args[0]; - String url = "file:" + name; - Configuration conf = NutchConfiguration.create(); - FeedParser parser = new FeedParser(); - parser.setConf(conf); - File file = new File(name); - byte[] bytes = new byte[(int) file.length()]; - DataInputStream in = new DataInputStream(new FileInputStream(file)); - /* Close the stream even when the read fails; the file handle was never released before. */ - try { - in.readFully(bytes); - } finally { - in.close(); - } - ParseResult parseResult = parser.getParse(new Content(url, url, bytes, - "application/rss+xml", new Metadata(), conf)); - for (Entry<Text, Parse> entry : parseResult) { - System.out.println("key: " + entry.getKey()); - Parse parse = entry.getValue(); - System.out.println("data: " + parse.getData()); - System.out.println("text: " + parse.getText() + "\n"); - } - } - - /** - * Builds the parse for a single feed entry and stores it in the given result - * under the entry's normalized and filtered link. Entries whose link is - * rejected by the filters, or whose normalization or filtering throws, are - * skipped. - * - * @param parseResult - * result that receives the entry's parse - * @param feed - * feed the entry belongs to, handed on to addFields - * @param feedLink - * value stored under the "feed" parse-metadata key, skipped when null - * @param entry - * feed entry to convert - * @param content - * fetched feed content; its metadata object is reused for the entry, - * so the Content-Type chosen by addFields is written into it - */ - private void addToMap(ParseResult parseResult, SyndFeed feed, - String feedLink, SyndEntry entry, Content content) { - String link = entry.getLink(), text = null, title = null; - Metadata parseMeta = new Metadata(), contentMeta = content.getMetadata(); - Parse parse = null; - SyndContent description = entry.getDescription(); - - try { - link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK); - - if (link != null) - link = filters.filter(link); - } catch (Exception e) { - e.printStackTrace(); - return; - } - - if (link == null) - return; - - title = stripTags(entry.getTitleEx()); - - if (feedLink != null) - parseMeta.set("feed", feedLink); - - addFields(parseMeta, contentMeta, feed, entry); - - // some item descriptions contain markup text in them, - // so we temporarily set their content-type to parse them - // with another plugin - String contentType = contentMeta.get(Response.CONTENT_TYPE); - - if (description != null) - text = description.getValue(); - - if (text == null) { - List<?> contents = entry.getContents(); - StringBuilder buf = new StringBuilder(); - for (Object syndContent : 
contents) { - buf.append(((SyndContent) syndContent).getValue()); - } - text = buf.toString(); - } - - try { - Parser parser = parserFactory.getParsers(contentType, link)[0]; - /* - * addFields labels the entry with CHARSET_UTF8, so encode explicitly - * instead of relying on the platform default charset. - * NOTE(review): TEXT_PLAIN_CONTENT_TYPE is defined outside this chunk; - * confirm it declares UTF-8 as well. - */ - byte[] entryBytes = text.getBytes(java.nio.charset.Charset.forName("UTF-8")); - parse = parser.getParse( - new Content(link, link, entryBytes, contentType, contentMeta, - conf)).get(link); - } catch (ParserNotFound e) { /* ignore */ - } - - if (parse != null) { - ParseData data = parse.getData(); - data.getContentMeta().remove(Response.CONTENT_TYPE); - mergeMetadata(data.getParseMeta(), parseMeta); - parseResult.put(link, new ParseText(parse.getText()), - new ParseData(ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(), - data.getContentMeta(), data.getParseMeta())); - } else { - /* No parser produced a result: store the raw text and mark the entry as failed. */ - contentMeta.remove(Response.CONTENT_TYPE); - parseResult.put(link, new ParseText(text), new ParseData( - ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta, - parseMeta)); - } - - } - - /** - * Removes everything that looks like a markup tag from the given content and - * trims the result. - * - * @param c - * content to clean, may be null - * @return the tag-free, trimmed text; an empty string when c or its value is - * null - */ - private static String stripTags(SyndContent c) { - if (c == null) - return ""; - - String value = c.getValue(); - - /* A content element without a value used to trigger a NullPointerException here. */ - if (value == null) - return ""; - - return value.replaceAll("<[^>]*>", "").trim(); - } - - /** - * Copies author, category and date information of a feed entry into the - * parse metadata and records the entry's content type in the content - * metadata. - * - * @param parseMeta - * receives the Feed.FEED_AUTHOR, FEED_TAGS, FEED_PUBLISHED and - * FEED_UPDATED values - * @param contentMeta - * receives Response.CONTENT_TYPE - * @param feed - * not read by this method - * @param entry - * feed entry to read from - */ - private void addFields(Metadata parseMeta, Metadata contentMeta, - SyndFeed feed, SyndEntry entry) { - List<?> authors = entry.getAuthors(), categories = entry.getCategories(); - Date published = entry.getPublishedDate(), updated = entry.getUpdatedDate(); - String contentType = null; - - /* An empty author list is treated like a missing one so the getAuthor fallback still runs. */ - if (authors != null && !authors.isEmpty()) { - for (Object o : authors) { - SyndPerson author = (SyndPerson) o; - String authorName = author.getName(); - if (checkString(authorName)) { - parseMeta.add(Feed.FEED_AUTHOR, authorName); - } - } - } else { - // getAuthors may return null (or nothing) if feed is non-atom - // if so, call getAuthor to get Dublin Core module creator. 
- String authorName = entry.getAuthor(); - if (checkString(authorName)) { - parseMeta.set(Feed.FEED_AUTHOR, authorName); - } - } - - if (categories != null) { - for (Object i : categories) { - parseMeta.add(Feed.FEED_TAGS, ((SyndCategory) i).getName()); - } - } - - if (published != null) { - parseMeta.set(Feed.FEED_PUBLISHED, Long.toString(published.getTime())); - } - if (updated != null) { - parseMeta.set(Feed.FEED_UPDATED, Long.toString(updated.getTime())); - } - - SyndContent description = entry.getDescription(); - if (description != null) { - contentType = description.getType(); - } else { - // TODO: What to do if contents.size() > 1? - List<?> contents = entry.getContents(); - if (contents != null && !contents.isEmpty()) { - contentType = ((SyndContent) contents.get(0)).getType(); - } - } - - if (checkString(contentType)) { - // ROME may return content-type as html - /* NOTE(review): "text/xhtml" is kept as-is; confirm it is the type the parser mapping expects. */ - if (contentType.equals("html")) - contentType = "text/html"; - else if (contentType.equals("xhtml")) - contentType = "text/xhtml"; - contentMeta.set(Response.CONTENT_TYPE, contentType + "; " + CHARSET_UTF8); - } else { - contentMeta.set(Response.CONTENT_TYPE, TEXT_PLAIN_CONTENT_TYPE); - } - - } - - /** - * Adds every name/value pair of the second metadata object to the first; - * values already present in the first are kept. - */ - private void mergeMetadata(Metadata first, Metadata second) { - for (String name : second.names()) { - String[] values = second.getValues(name); - for (String value : values) { - first.add(name, value); - } - } - } - - /** - * @return true when the string is neither null nor empty - */ - private boolean checkString(String s) { - return s != null && !s.isEmpty(); - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java ---------------------------------------------------------------------- diff --git a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java b/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java deleted file mode 100644 index 3b15968..0000000 --- a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Licensed to the Apache Software 
Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Parse RSS feeds. - */ -package org.apache.nutch.parse.feed; - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java b/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java deleted file mode 100644 index 36c8739..0000000 --- a/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java +++ /dev/null @@ -1,124 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parse.feed; - -// JDK imports -import java.util.Iterator; -import java.util.Map; - -import org.junit.Assert; -import org.junit.Test; -// APACHE imports -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseException; -import org.apache.nutch.parse.ParseResult; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.protocol.ProtocolNotFound; -import org.apache.nutch.util.NutchConfiguration; - -/** - * Test suite for the {@link FeedParser}. - * - * @author mattmann - */ -public class TestFeedParser { - - private String fileSeparator = System.getProperty("file.separator"); - - /* Directory holding the sample feeds; read from the "test.data" system property, "." when unset. */ - private String sampleDir = System.getProperty("test.data", "."); - - /* The sample files are expected to have been copied into that directory by the plugin build. */ 
- - private String[] sampleFiles = { "rsstest.rss" }; - - public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class - .getName()); - - /** - * Fetches each sample feed through the protocol layer, parses it with the - * "feed" extension and verifies that the result holds exactly three entries: - * <ul> - * <li>http://www-scf.usc.edu/~mattmann/</li> - * <li>http://www.nutch.org/</li> - * <li>the URL of the feed file itself</li> - * </ul> - * Every entry must carry a non-null parse with non-null parse data. - * - * @throws ProtocolNotFound - * if no protocol implementation can be found for the feed URL - * @throws ParseException - * if the feed parser cannot be loaded or fails - */ - @Test - public void testParseFetchChannel() throws ProtocolNotFound, ParseException { - Configuration conf = NutchConfiguration.create(); - - for (String sampleFile : sampleFiles) { - String feedUrl = ("file:" + sampleDir + fileSeparator + sampleFile) - .replace('\\', '/'); - - Protocol fetcher = new ProtocolFactory(conf).getProtocol(feedUrl); - Content fetched = fetcher.getProtocolOutput(new Text(feedUrl), - new CrawlDatum()).getContent(); - - ParseResult parsed = new ParseUtil(conf).parseByExtensionId("feed", - fetched); - - Assert.assertEquals(3, parsed.size()); - - boolean sawAuthorPage = false; - boolean sawNutchSite = false; - boolean sawFeedItself = false; - - for (Map.Entry<Text, Parse> item : parsed) { - String key = item.getKey().toString(); - - if ("http://www-scf.usc.edu/~mattmann/".equals(key)) { - sawAuthorPage = true; - } else if ("http://www.nutch.org/".equals(key)) { - sawNutchSite = true; - } else if (feedUrl.equals(key)) { - sawFeedItself = true; - } - - Parse itemParse = item.getValue(); - Assert.assertNotNull(itemParse); - Assert.assertNotNull(itemParse.getData()); - } - - if (!(sawAuthorPage && sawNutchSite && sawFeedItself)) { - Assert.fail("Outlinks read from sample rss file are not correct!"); - } 
- } - - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/headings/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/headings/build.xml b/src/plugin/headings/build.xml deleted file mode 100644 index d334ad1..0000000 --- a/src/plugin/headings/build.xml +++ /dev/null @@ -1,22 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="headings" default="jar-core"> - - <import file="../build-plugin.xml"/> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/headings/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/headings/ivy.xml b/src/plugin/headings/ivy.xml deleted file mode 100644 index 5b8393b..0000000 --- a/src/plugin/headings/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. 
- The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="${nutch.root}/ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!-- No name is given here: the artifact is derived from the module name. --> - <artifact conf="master"/> - </publications> - - <dependencies> <!-- No dependencies are declared for this module. --> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/headings/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/headings/plugin.xml b/src/plugin/headings/plugin.xml deleted file mode 100644 index 0d7921a..0000000 --- a/src/plugin/headings/plugin.xml +++ /dev/null @@ -1,45 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="headings" - name="Headings Parse Filter" - version="1.0.0" - provider-name="nutch.org"> - - - <runtime> - <library name="headings.jar"> - <export name="*"/> - </library> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - </requires> - - <extension id="org.apache.nutch.parse.headings" - name="Nutch Headings Parse Filter" - point="org.apache.nutch.parse.HtmlParseFilter"> - <!-- Plugs HeadingsParseFilter into the HtmlParseFilter extension point. --> - <implementation id="HeadingsParseFilter" - class="org.apache.nutch.parse.headings.HeadingsParseFilter"> - </implementation> - - </extension> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java b/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java deleted file mode 100644 index 657f260..0000000 --- a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java +++ /dev/null @@ -1,124 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parse.headings; - -import java.util.ArrayList; -import java.util.List; -import java.util.regex.*; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.parse.HTMLMetaTags; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.HtmlParseFilter; -import org.apache.nutch.parse.ParseResult; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.util.NodeWalker; -import org.w3c.dom.*; - -/** - * HtmlParseFilter that copies the text of the configured heading elements - * (property "headings") from the DOM into the parse metadata. - */ -public class HeadingsParseFilter implements HtmlParseFilter { - - /** - * Pattern used to strip surplus whitespace - */ - protected static Pattern whitespacePattern = Pattern.compile("\\s+"); - - private Configuration conf; - private String[] headings; - private boolean multiValued = false; - - /** - * Adds the text of every configured heading element found in the document - * to the parse metadata of the page, keyed by the configured element name. - * - * @param content - * fetched page; its URL selects the parse to annotate - * @param parseResult - * parse result to annotate - * @param metaTags - * not read by this filter - * @param doc - * DOM fragment that is searched for headings - * @return the same parseResult instance - */ - public ParseResult filter(Content content, ParseResult parseResult, - HTMLMetaTags metaTags, DocumentFragment doc) { - Parse parse = parseResult.get(content.getUrl()); - - /* Nothing to annotate when no headings are configured or the page has no parse. */ - if (headings == null || parse == null) { - return parseResult; - } - - for (String tagName : headings) { - for (String heading : getElement(doc, tagName)) { - if (heading == null) { - continue; - } - - /* String.trim() returns a new string; the previous bare call discarded it. */ - String trimmed = heading.trim(); - - if (trimmed.length() > 0) { - parse.getData().getParseMeta().add(tagName, trimmed); - } - } - } - - return parseResult; - } - - /** - * Stores the configuration and reads the "headings" element names and the - * "headings.multivalued" flag (default false) from it. - */ - public void setConf(Configuration conf) { - this.conf = conf; - - headings = conf.getStrings("headings"); - multiValued = conf.getBoolean("headings.multivalued", false); 
- } - - public Configuration getConf() { - return this.conf; - } - - /** - * Collects the text of the elements with the given name, compared - * case-insensitively. Only the first match is returned unless multiValued - * is enabled. - * - * @param doc - * DOM fragment to walk - * @param element - * element name to look for - * @return the text values found, possibly empty, never null - */ - protected List<String> getElement(DocumentFragment doc, String element) { - List<String> found = new ArrayList<String>(); - NodeWalker walker = new NodeWalker(doc); - - while (walker.hasNext()) { - Node currentNode = walker.nextNode(); - - if (currentNode.getNodeType() == Node.ELEMENT_NODE - && element.equalsIgnoreCase(currentNode.getNodeName())) { - found.add(getNodeValue(currentNode)); - - /* With multiValued disabled the first match is all that is wanted. */ - if (!multiValued) { - break; - } - } - } - - return found; - } - - /** - * Returns the concatenated text of the node's immediate text-node children - * with every run of whitespace collapsed to a single space. Text nested - * inside child elements is not included. - */ - protected static String getNodeValue(Node node) { - StringBuilder buffer = new StringBuilder(); - - NodeList children = node.getChildNodes(); - - for (int i = 0; i < children.getLength(); i++) { - Node child = children.item(i); - if (child.getNodeType() == Node.TEXT_NODE) { - buffer.append(child.getNodeValue()); - } - } - - /* Collapse surplus whitespace before returning. */ - Matcher matcher = whitespacePattern.matcher(buffer.toString().trim()); - return matcher.replaceAll(" ").trim(); - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java ---------------------------------------------------------------------- diff --git a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java b/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java deleted file mode 100644 index 363e0b2..0000000 --- a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Parse filter to extract headings (h1, h2, etc.) from DOM parse tree. - */ -package org.apache.nutch.parse.headings; - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-anchor/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-anchor/build.xml b/src/plugin/index-anchor/build.xml deleted file mode 100644 index 597b532..0000000 --- a/src/plugin/index-anchor/build.xml +++ /dev/null @@ -1,22 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<project name="index-anchor" default="jar-core"> - <!-- No targets are defined in this file; it only imports the shared ../build-plugin.xml. --> - <import file="../build-plugin.xml" /> - -</project> \ No newline at end of file
