Author: lewismc
Date: Thu Mar 13 12:51:12 2014
New Revision: 1577143
URL: http://svn.apache.org/r1577143
Log:
NUTCH-1478 Parse-metatags and index-metadata plugin for Nutch 2.x series
Added:
nutch/branches/2.x/src/plugin/index-metadata/
nutch/branches/2.x/src/plugin/index-metadata/build.xml
nutch/branches/2.x/src/plugin/index-metadata/ivy.xml
nutch/branches/2.x/src/plugin/index-metadata/plugin.xml
nutch/branches/2.x/src/plugin/index-metadata/src/
nutch/branches/2.x/src/plugin/index-metadata/src/java/
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
nutch/branches/2.x/src/plugin/parse-metatags/
nutch/branches/2.x/src/plugin/parse-metatags/README.txt
nutch/branches/2.x/src/plugin/parse-metatags/build.xml
nutch/branches/2.x/src/plugin/parse-metatags/ivy.xml
nutch/branches/2.x/src/plugin/parse-metatags/plugin.xml
nutch/branches/2.x/src/plugin/parse-metatags/sample/
nutch/branches/2.x/src/plugin/parse-metatags/sample/testMetatags.html
nutch/branches/2.x/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
nutch/branches/2.x/src/plugin/parse-metatags/src/
nutch/branches/2.x/src/plugin/parse-metatags/src/java/
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java
nutch/branches/2.x/src/plugin/parse-metatags/src/test/
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/TestMetaTagsParser.java
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/build.xml
nutch/branches/2.x/conf/nutch-default.xml
nutch/branches/2.x/conf/schema.xml
nutch/branches/2.x/src/plugin/build.xml
nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1577143&r1=1577142&r2=1577143&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Thu Mar 13 12:51:12 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1478 Parse-metatags and index-metadata plugin for Nutch 2.x series
(kiran, Nguyen Manh Tien, Talat UYARER, Vangelis Karvounis via lewismc)
+
* NUTCH-1729 Upgrade to Tika 1.5 (jnioche)
* NUTCH-1721 Upgrade to Crawler commons 0.3 (tejasp)
Modified: nutch/branches/2.x/build.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/build.xml?rev=1577143&r1=1577142&r2=1577143&view=diff
==============================================================================
--- nutch/branches/2.x/build.xml (original)
+++ nutch/branches/2.x/build.xml Thu Mar 13 12:51:12 2014
@@ -151,9 +151,11 @@
<packageset dir="${src.dir}"/>
<packageset dir="${plugins.dir}/creativecommons/src/java"/>
<packageset dir="${plugins.dir}/feed/src/java"/>
+ <packageset dir="${plugins.dir}/index-metadata/src/java"/>
<packageset dir="${plugins.dir}/index-anchor/src/java"/>
<packageset dir="${plugins.dir}/index-basic/src/java"/>
<packageset dir="${plugins.dir}/index-more/src/java"/>
+ <packageset dir="${plugins.dir}/index-metadata/src/java"/>
<packageset dir="${plugins.dir}/indexer-solr/src/java"/>
<packageset dir="${plugins.dir}/language-identifier/src/java"/>
<packageset dir="${plugins.dir}/lib-http/src/java"/>
@@ -161,6 +163,7 @@
<packageset dir="${plugins.dir}/microformats-reltag/src/java"/>
<packageset dir="${plugins.dir}/parse-ext/src/java"/>
<packageset dir="${plugins.dir}/parse-html/src/java"/>
+ <packageset dir="${plugins.dir}/parse-metatags/src/java"/>
<packageset dir="${plugins.dir}/parse-js/src/java"/>
<packageset dir="${plugins.dir}/parse-swf/src/java"/>
<packageset dir="${plugins.dir}/parse-tika/src/java"/>
@@ -576,6 +579,7 @@
<packageset dir="${plugins.dir}/parse-ext/src/java"/>
<packageset dir="${plugins.dir}/parse-html/src/java"/>
<packageset dir="${plugins.dir}/parse-js/src/java"/>
+ <packageset dir="${plugins.dir}/parse-metatags/src/java"/>
<packageset dir="${plugins.dir}/parse-swf/src/java"/>
<packageset dir="${plugins.dir}/parse-tika/src/java"/>
<packageset dir="${plugins.dir}/parse-zip/src/java"/>
@@ -937,6 +941,7 @@
<source path="${basedir}/src/plugin/index-basic/src/test/" />
<source path="${basedir}/src/plugin/index-more/src/java/" />
<source path="${basedir}/src/plugin/index-more/src/test/" />
+ <source path="${basedir}/src/plugin/index-metadata/src/java/" />
<source path="${basedir}/src/plugin/language-identifier/src/java/" />
<source path="${basedir}/src/plugin/language-identifier/src/test/" />
<source path="${basedir}/src/plugin/lib-http/src/java/" />
@@ -952,6 +957,8 @@
<source path="${basedir}/src/plugin/parse-html/src/test/" />
<source path="${basedir}/src/plugin/parse-js/src/java/" />
<source path="${basedir}/src/plugin/parse-js/src/test/" />
+ <source path="${basedir}/src/plugin/parse-metatags/src/java/" />
+ <source path="${basedir}/src/plugin/parse-metatags/src/test/" />
<!-- parse-swf and parse-zip are currently disabled
<source path="${basedir}/src/plugin/parse-swf/src/java/" />
<source path="${basedir}/src/plugin/parse-swf/src/test/" />
Modified: nutch/branches/2.x/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1577143&r1=1577142&r2=1577143&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Thu Mar 13 12:51:12 2014
@@ -1096,6 +1096,30 @@
</description>
</property>
+<!-- index-metadata plugin properties -->
+
+<property>
+ <name>index.metadata</name>
+ <value>description,keywords</value>
+ <description>
+ Comma-separated list of keys to be taken from the metadata to generate
+ fields. Can be used e.g. for 'description' or 'keywords' provided that
+ these values are generated by a parser (see parse-metatags plugin).
+ </description>
+</property>
+
+<!-- parse-metatags plugin properties -->
+<property>
+ <name>metatags.names</name>
+ <value>*</value>
+ <description> Names of the metatags to extract, separated by ';'.
+ Use '*' to extract all metatags. Prefixes the names with 'meta_'
+ in the parse-metadata. For instance to index description and keywords,
+ you need to activate the plugin index-metadata and set the value of the
+ parameter 'index.metadata' to 'description,keywords' (comma-separated;
+ index-metadata adds the 'meta_' prefix itself).
+ </description>
+</property>
+
<!-- Temporary Hadoop 0.17.x workaround. -->
<property>
Modified: nutch/branches/2.x/conf/schema.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1577143&r1=1577142&r2=1577143&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema.xml (original)
+++ nutch/branches/2.x/conf/schema.xml Thu Mar 13 12:51:12 2014
@@ -336,6 +336,9 @@
<field name="contentLength" type="string" stored="true" indexed="false"/>
<field name="lastModified" type="date" stored="true" indexed="false"/>
<field name="date" type="tdate" stored="true" indexed="true"/>
+
+ <!-- fields for index-metadata plugin -->
+ <dynamicField name="meta_*" type="string" stored="true" indexed="true"/>
<!-- fields for languageidentifier plugin -->
<field name="lang" type="string" stored="true" indexed="true"/>
Modified: nutch/branches/2.x/src/plugin/build.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/build.xml?rev=1577143&r1=1577142&r2=1577143&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/build.xml (original)
+++ nutch/branches/2.x/src/plugin/build.xml Thu Mar 13 12:51:12 2014
@@ -30,6 +30,7 @@
<ant dir="index-anchor" target="deploy"/>
<ant dir="index-basic" target="deploy"/>
<ant dir="index-more" target="deploy"/>
+ <ant dir="index-metadata" target="deploy"/>
<ant dir="indexer-solr" target="deploy"/>
<ant dir="indexer-elastic" target="deploy"/>
<ant dir="language-identifier" target="deploy"/>
@@ -47,6 +48,7 @@
<ant dir="parse-js" target="deploy"/>
<ant dir="parse-html" target="deploy"/>
<ant dir="parse-tika" target="deploy"/>
+ <ant dir="parse-metatags" target="deploy"/>
<ant dir="scoring-link" target="deploy"/>
<ant dir="scoring-opic" target="deploy"/>
<ant dir="subcollection" target="deploy"/>
@@ -77,6 +79,7 @@
<ant dir="protocol-file" target="test"/>
<ant dir="parse-html" target="test"/>
<ant dir="parse-js" target="test"/>
+ <ant dir="parse-metatags" target="test"/>
<ant dir="index-anchor" target="test"/>
<ant dir="index-basic" target="test"/>
<ant dir="index-more" target="test"/>
@@ -113,6 +116,7 @@
<ant dir="index-anchor" target="clean"/>
<ant dir="index-basic" target="clean"/>
<ant dir="index-more" target="clean"/>
+ <ant dir="index-metadata" target="clean"/>
<ant dir="indexer-solr" target="clean"/>
<ant dir="indexer-elastic" target="clean"/>
<ant dir="language-identifier" target="clean"/>
@@ -128,6 +132,7 @@
<ant dir="parse-swf" target="clean"/>
<ant dir="parse-tika" target="clean"/>
<ant dir="parse-zip" target="clean"/>
+ <ant dir="parse-metatags" target="clean"/>
<ant dir="protocol-file" target="clean"/>
<ant dir="protocol-ftp" target="clean"/>
<ant dir="protocol-httpclient" target="clean"/>
Added: nutch/branches/2.x/src/plugin/index-metadata/build.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-metadata/build.xml?rev=1577143&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/index-metadata/build.xml (added)
+++ nutch/branches/2.x/src/plugin/index-metadata/build.xml Thu Mar 13 12:51:12
2014
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-metadata" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
Added: nutch/branches/2.x/src/plugin/index-metadata/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-metadata/ivy.xml?rev=1577143&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/index-metadata/ivy.xml (added)
+++ nutch/branches/2.x/src/plugin/index-metadata/ivy.xml Thu Mar 13 12:51:12
2014
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../../ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
Added: nutch/branches/2.x/src/plugin/index-metadata/plugin.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-metadata/plugin.xml?rev=1577143&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/index-metadata/plugin.xml (added)
+++ nutch/branches/2.x/src/plugin/index-metadata/plugin.xml Thu Mar 13 12:51:12
2014
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="index-metadata"
+ name="Metadata Indexing Filter"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="index-metadata.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+
+ <extension id="org.apache.nutch.indexer.metadata"
+ name="Nutch Metadata Indexing Filter"
+ point="org.apache.nutch.indexer.IndexingFilter">
+ <implementation id="MetadataIndexer"
+
class="org.apache.nutch.indexer.metadata.MetadataIndexer"/>
+ </extension>
+
+</plugin>
Added:
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java?rev=1577143&view=auto
==============================================================================
---
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
(added)
+++
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
Thu Mar 13 12:51:12 2014
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.metadata;
+
+import java.nio.ByteBuffer;
+import java.util.Collection;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.storage.WebPage.Field;
+
+/**
+ * Indexing filter that copies selected page metadata entries into the index
+ * document. The entries are named by the comma-separated property
+ * <code>index.metadata</code> (e.g. <code>description,keywords</code>). Each
+ * configured name is read from the page metadata under the key
+ * <code>meta_&lt;name&gt;</code> and added to the document as the field
+ * <code>meta_&lt;name&gt;</code>; a tab-separated value is split and added as
+ * several values of that field.
+ */
+
+public class MetadataIndexer implements IndexingFilter {
+  private static final String PARSE_CONF_PROPERTY = "index.metadata";
+  private static final String INDEX_PREFIX = "meta_";
+  private static final String PARSE_META_PREFIX = "meta_";
+
+  /** Charset used to decode stored metadata values. */
+  private static final java.nio.charset.Charset UTF8 =
+      java.nio.charset.Charset.forName("UTF-8");
+
+  private Configuration conf;
+
+  /**
+   * Names listed in <code>index.metadata</code>. Kept per instance so that
+   * filters created with different configurations do not overwrite each
+   * other's settings.
+   */
+  private String[] parseFieldnames;
+
+  /**
+   * Adds the configured metadata entries of <code>page</code> to
+   * <code>doc</code>.
+   *
+   * @param doc document being built; returned unchanged when null
+   * @param url URL of the page (not used by this filter)
+   * @param page page whose metadata is read
+   * @return <code>doc</code>, with one field value per tab-separated part of
+   *         every configured metadata entry that is present
+   */
+  @Override
+  public NutchDocument filter(NutchDocument doc, String url, WebPage page)
+      throws IndexingException {
+
+    // nothing to do without a document or without configured names
+    if (doc == null || parseFieldnames == null) {
+      return doc;
+    }
+
+    for (String configured : parseFieldnames) {
+      // tolerate "key1, key2" style lists and stray empty entries
+      String metatag = configured.trim();
+      if (metatag.length() == 0) {
+        continue;
+      }
+      ByteBuffer bvalues = page.getFromMetadata(new Utf8(PARSE_META_PREFIX
+          + metatag));
+      if (bvalues == null) {
+        continue;
+      }
+      // Decode only the readable region of the buffer (array() would expose
+      // the whole backing array) and do so on a duplicate so the position of
+      // the stored buffer is left untouched.
+      String value = UTF8.decode(bvalues.duplicate()).toString();
+      for (String eachvalue : value.split("\t")) {
+        doc.add(INDEX_PREFIX + metatag, eachvalue);
+      }
+    }
+
+    return doc;
+  }
+
+  /**
+   * Stores the configuration and reads the list of metadata names from the
+   * <code>index.metadata</code> property.
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    parseFieldnames = conf.getStrings(PARSE_CONF_PROPERTY);
+    // TODO check conflict between field names e.g. could have same label
+    // from different sources
+  }
+
+  @Override
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Declares {@link Field#METADATA}, the only page field read by
+   * {@link #filter(NutchDocument, String, WebPage)}.
+   */
+  @Override
+  public Collection<Field> getFields() {
+    return java.util.Collections.singleton(Field.METADATA);
+  }
+}
Added: nutch/branches/2.x/src/plugin/parse-metatags/README.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/README.txt?rev=1577143&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-metatags/README.txt (added)
+++ nutch/branches/2.x/src/plugin/parse-metatags/README.txt Thu Mar 13 12:51:12
2014
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+Parse-metatags plugin
+
+The parse-metatags plugin consists of a ParseFilter which takes as
+parameter a list of metatag names with '*' as default value. The values
+are separated by ';'.
+In order to extract the values of the metatags description and keywords,
+you must specify in nutch-site.xml
+
+<property>
+ <name>metatags.names</name>
+ <value>description;keywords</value>
+</property>
+
+Prefixes the names with 'meta_' in the parse-metadata. For instance to
+index description and keywords, you need to activate the plugin index-metadata
+and set the value of the parameter 'index.metadata' to 'description,keywords'
+(comma-separated; index-metadata adds the 'meta_' prefix itself).
+
+This code has been developed by DigitalPebble Ltd and offered to the community
by ANT.com
+
+
+
+
Added: nutch/branches/2.x/src/plugin/parse-metatags/build.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/build.xml?rev=1577143&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-metatags/build.xml (added)
+++ nutch/branches/2.x/src/plugin/parse-metatags/build.xml Thu Mar 13 12:51:12
2014
@@ -0,0 +1,53 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-metatags" default="jar-core">
+
+ <import file="../build-plugin.xml" />
+
+ <!-- Build compilation dependencies -->
+ <target name="deps-jar">
+ <ant target="jar" inheritall="false" dir="../lib-nekohtml"/>
+ <ant target="jar" inheritall="false" dir="../parse-html"/>
+ </target>
+
+ <!-- Add compilation dependencies to classpath -->
+ <path id="plugin.deps">
+ <fileset dir="${nutch.root}/build">
+ <include name="**/lib-nekohtml/*.jar" />
+ <include name="**/parse-html/*.jar" />
+ </fileset>
+ </path>
+
+ <!-- Deploy Unit test dependencies -->
+ <target name="deps-test">
+ <ant target="deploy" inheritall="false" dir="../lib-nekohtml"/>
+ <ant target="deploy" inheritall="false" dir="../parse-html"/>
+ <ant target="deploy" inheritall="false"
dir="../nutch-extensionpoints" />
+ <ant target="deploy" inheritall="false" dir="../protocol-file"
/>
+ </target>
+
+
+ <!-- for junit test -->
+ <mkdir dir="${build.test}/data" />
+ <copy todir="${build.test}/data">
+ <fileset dir="sample">
+ <include name="*.html"/>
+ </fileset>
+ </copy>
+
+</project>
Added: nutch/branches/2.x/src/plugin/parse-metatags/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/ivy.xml?rev=1577143&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-metatags/ivy.xml (added)
+++ nutch/branches/2.x/src/plugin/parse-metatags/ivy.xml Thu Mar 13 12:51:12
2014
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
Added: nutch/branches/2.x/src/plugin/parse-metatags/plugin.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/plugin.xml?rev=1577143&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-metatags/plugin.xml (added)
+++ nutch/branches/2.x/src/plugin/parse-metatags/plugin.xml Thu Mar 13 12:51:12
2014
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+ id="parse-metatags"
+ name="MetaTags"
+ version="1.0"
+ provider-name="digitalpebble.com">
+
+ <runtime>
+ <library name="parse-metatags.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <extension id="org.apache.nutch.parse.metatags.parser"
+ name="MetaTags Parser"
+ point="org.apache.nutch.parse.ParseFilter">
+ <implementation id="MetaTagsParser"
+ class="org.apache.nutch.parse.MetaTagsParser"/>
+ </extension>
+
+</plugin>
+
Added: nutch/branches/2.x/src/plugin/parse-metatags/sample/testMetatags.html
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/sample/testMetatags.html?rev=1577143&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-metatags/sample/testMetatags.html
(added)
+++ nutch/branches/2.x/src/plugin/parse-metatags/sample/testMetatags.html Thu
Mar 13 12:51:12 2014
@@ -0,0 +1,11 @@
+<html>
+<head>
+<meta name="Keywords" content="This is a test of keywords" />
+<meta name="Description" content="This is a test of description" />
+<meta name="Creator" content="Author1" />
+<meta name="Creator" content="Author2" />
+</head>
+<body>
+text of the document
+</body>
+
Added:
nutch/branches/2.x/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/sample/testMultivalueMetatags.html?rev=1577143&view=auto
==============================================================================
---
nutch/branches/2.x/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
(added)
+++
nutch/branches/2.x/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
Thu Mar 13 12:51:12 2014
@@ -0,0 +1,13 @@
+<html>
+<head>
+<meta name="DC.creator" content="Doug Cutting">
+<meta name="DC.creator" content="Michael Cafarella">
+<!-- meta keywords in different casing -->
+<meta name="keywords" lang="en" content="web crawler" />
+<meta name="Keywords" lang="fr" content="robot d'indexation" />
+<meta name="KEYWORDS" lang="de" content="Webcrawler" />
+</head>
+<body>
+A test for multi-valued metatags.
+</body>
+</html>
\ No newline at end of file
Added:
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java?rev=1577143&view=auto
==============================================================================
---
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java
(added)
+++
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java
Thu Mar 13 12:51:12 2014
@@ -0,0 +1,148 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+import java.util.Map.Entry;
+
+import org.apache.avro.util.Utf8;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.storage.WebPage.Field;
+import org.apache.nutch.util.Bytes;
+import org.w3c.dom.DocumentFragment;
+
+/**
+ * Parse filter that stores HTML meta tags (e.g. keywords, description) in the
+ * page metadata so that they can be indexed with the index-metadata plugin.
+ * Every stored key is the lower-cased tag name prefixed with
+ * {@link #PARSE_META_PREFIX} ('meta_'). The tags to keep are listed in the
+ * property <code>metatags.names</code>, separated by ';'; '*' keeps all.
+ */
+
+public class MetaTagsParser implements ParseFilter {
+
+  private static final Log LOG = LogFactory.getLog(MetaTagsParser.class
+      .getName());
+
+  public static final String PARSE_META_PREFIX = "meta_";
+
+  private Configuration conf;
+
+  /** Lower-cased names of the tags to keep; contains "*" to keep all. */
+  private Set<String> metatagset = new HashSet<String>();
+
+  /**
+   * Stores the configuration and rebuilds the set of wanted tag names from
+   * <code>metatags.names</code>. Names are trimmed and lower-cased; when no
+   * usable name is configured every tag is kept.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    // start from scratch so a second call does not accumulate old names
+    metatagset.clear();
+    // specify whether we want a specific subset of metadata
+    // by default take everything we can find
+    String metatags = conf.get("metatags.names", "*");
+    for (String val : metatags.split(";")) {
+      String name = val.trim().toLowerCase(java.util.Locale.ROOT);
+      if (name.length() > 0) {
+        metatagset.add(name);
+      }
+    }
+    if (metatagset.isEmpty()) {
+      metatagset.add("*");
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Copies the wanted metadata entries, general meta tags and http-equiv
+   * meta tags into the page metadata under their prefixed, lower-cased name.
+   *
+   * @param url URL of the page (not used by this filter)
+   * @param page page whose metadata is read and extended
+   * @param parse parse result, returned unchanged
+   * @param metaTags meta tags extracted from the HTML document
+   * @param doc DOM of the document (not used by this filter)
+   * @return <code>parse</code>
+   */
+  public Parse filter(String url, WebPage page, Parse parse,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    // check in the metadata first : the tika-parser
+    // might have stored the values there already.
+    // The copies are buffered because the page metadata must not be
+    // modified while it is being iterated.
+    Map<Utf8, ByteBuffer> metadata = new HashMap<Utf8, ByteBuffer>();
+    for (Entry<Utf8, ByteBuffer> entry : page.getMetadata().entrySet()) {
+      String mdName = entry.getKey().toString();
+      if (!isWanted(mdName)) {
+        continue;
+      }
+      String value = Bytes.toStringBinary(entry.getValue());
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Found meta tag: '" + mdName + "', with value: '" + value
+            + "'");
+      }
+      metadata.put(prefixedKey(mdName), ByteBuffer.wrap(Bytes.toBytes(value)));
+    }
+    // write the buffered copies back (iterate the buffer, not the already
+    // exhausted page-metadata iterator)
+    for (Entry<Utf8, ByteBuffer> entry : metadata.entrySet()) {
+      page.putToMetadata(entry.getKey(), entry.getValue());
+    }
+
+    // general meta tags; each name carries a single value here, so it is
+    // stored as is (multiple values would be separated by '\t' in storage)
+    Properties generalMetaTags = metaTags.getGeneralTags();
+    for (Object key : generalMetaTags.keySet()) {
+      String name = (String) key;
+      String value = (String) generalMetaTags.get(name);
+      storeIfWanted(page, name, value);
+    }
+
+    // http-equiv meta tags
+    Properties httpequiv = metaTags.getHttpEquivTags();
+    Enumeration<?> tagNames = httpequiv.propertyNames();
+    while (tagNames.hasMoreElements()) {
+      String name = (String) tagNames.nextElement();
+      storeIfWanted(page, name, httpequiv.getProperty(name));
+    }
+
+    return parse;
+  }
+
+  /** Tells whether the tag is selected by the configuration or by '*'. */
+  private boolean isWanted(String name) {
+    return metatagset.contains("*")
+        || metatagset.contains(name.toLowerCase(java.util.Locale.ROOT));
+  }
+
+  /** Builds the metadata key: prefix plus lower-cased tag name. */
+  private static Utf8 prefixedKey(String name) {
+    return new Utf8(PARSE_META_PREFIX
+        + name.toLowerCase(java.util.Locale.ROOT));
+  }
+
+  /** Stores the tag value in the page metadata when the tag is wanted. */
+  private void storeIfWanted(WebPage page, String name, String value) {
+    if (!isWanted(name)) {
+      return;
+    }
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Found meta tag : " + name + "\t" + value);
+    }
+    page.putToMetadata(prefixedKey(name),
+        ByteBuffer.wrap(Bytes.toBytes(value)));
+  }
+
+  /**
+   * Declares {@link Field#METADATA}, the only page field read and written by
+   * this filter.
+   */
+  @Override
+  public Collection<Field> getFields() {
+    return java.util.Collections.singleton(Field.METADATA);
+  }
+
+}
Added:
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/TestMetaTagsParser.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/TestMetaTagsParser.java?rev=1577143&view=auto
==============================================================================
---
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/TestMetaTagsParser.java
(added)
+++
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/TestMetaTagsParser.java
Thu Mar 13 12:51:12 2014
@@ -0,0 +1,216 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.html.dom.HTMLDocumentImpl;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.Bytes;
+import org.apache.nutch.util.NutchConfiguration;
+import org.cyberneko.html.parsers.DOMFragmentParser;
+import org.junit.Test;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.ByteBuffer;
+import java.util.Map;
+
+ /**
+  * Tests for {@link MetaTagsParser}: parses a sample HTML file and checks that
+  * the description and keywords meta tags are stored in the page metadata
+  * under {@link MetaTagsParser#PARSE_META_PREFIX}.
+  */
+ public class TestMetaTagsParser {
+
+   private final String fileSeparator = System.getProperty("file.separator");
+   // Directory holding the sample files; defaults to the working directory.
+   private final String sampleDir = System.getProperty("test.data", ".");
+   private final String sampleFile = "testMetatags.html";
+   private final String description = "This is a test of description";
+   private final String keywords = "This is a test of keywords";
+
+   /**
+    * Parses a sample file and returns the metadata stored on the resulting
+    * page. Any exception raised on the way fails the calling test.
+    *
+    * @param fileName
+    *          name of the sample file, resolved against the test data directory
+    * @param useUtil
+    *          if true the page is parsed through {@link ParseUtil}; otherwise a
+    *          DOM fragment is built locally and handed straight to
+    *          {@link MetaTagsParser#filter}
+    * @return the metadata of the parsed page
+    */
+   public Map<Utf8, ByteBuffer> parseMetaTags(String fileName, boolean useUtil) {
+     try {
+       Configuration conf = NutchConfiguration.create();
+       String path = sampleDir + fileSeparator + fileName;
+       String urlString = "file:" + path;
+
+       byte[] bytes = readFile(new File(path));
+
+       WebPage page = new WebPage();
+       page.setBaseUrl(new Utf8(urlString));
+       page.setContent(ByteBuffer.wrap(bytes));
+       page.setContentType(new Utf8("text/html"));
+
+       if (useUtil) {
+         ParseUtil parser = new ParseUtil(conf);
+         parser.parse(urlString, page);
+       } else {
+         DocumentFragment node = getDOMDocument(bytes);
+         HTMLMetaTags metaTags = new HTMLMetaTags();
+         // A malformed URL is not swallowed: it reaches the catch below and
+         // fails the test instead of continuing with a null base URL.
+         URL base = new URL(urlString);
+         // get meta directives
+         getMetaTags(metaTags, node, base);
+
+         MetaTagsParser mtp = new MetaTagsParser();
+         mtp.setConf(conf);
+         mtp.filter(urlString, page, new Parse(), metaTags, node);
+       }
+
+       return page.getMetadata();
+     } catch (Exception e) {
+       // fail() only carries e.toString(), so print the trace to keep the
+       // origin of the failure visible in the test output.
+       e.printStackTrace();
+       fail(e.toString());
+       return null;
+     }
+   }
+
+   /**
+    * Reads the whole file into memory, closing the stream even when the read
+    * fails.
+    *
+    * @param file
+    *          file to read
+    * @return the file content
+    * @throws IOException
+    *           if the file cannot be opened or fully read
+    */
+   private static byte[] readFile(File file) throws IOException {
+     byte[] bytes = new byte[(int) file.length()];
+     DataInputStream in = new DataInputStream(new FileInputStream(file));
+     try {
+       in.readFully(bytes);
+     } finally {
+       in.close();
+     }
+     return bytes;
+   }
+
+   /**
+    * Walks the DOM below {@code node} and records every {@code meta} element
+    * found outside {@code body}: {@code name}/{@code content} pairs go into the
+    * general tags, {@code http-equiv}/{@code content} pairs into the http-equiv
+    * tags. Keys are stored lower-cased.
+    *
+    * @param metaTags
+    *          receiver of the collected tags
+    * @param node
+    *          root of the subtree to scan
+    * @param currURL
+    *          passed on to recursive calls; not otherwise used here
+    */
+   public static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node,
+       URL currURL) {
+
+     if (node.getNodeType() == Node.ELEMENT_NODE) {
+
+       if ("body".equalsIgnoreCase(node.getNodeName())) {
+         // META tags should not be under body
+         return;
+       }
+
+       if ("meta".equalsIgnoreCase(node.getNodeName())) {
+         NamedNodeMap attrs = node.getAttributes();
+         Node nameNode = null;
+         Node equivNode = null;
+         Node contentNode = null;
+         // Retrieves name, http-equiv and content attributes. Names are
+         // matched with equalsIgnoreCase so the result does not depend on the
+         // default locale's lower-casing rules.
+         for (int i = 0; i < attrs.getLength(); i++) {
+           Node attr = attrs.item(i);
+           String attrName = attr.getNodeName();
+           if ("name".equalsIgnoreCase(attrName)) {
+             nameNode = attr;
+           } else if ("http-equiv".equalsIgnoreCase(attrName)) {
+             equivNode = attr;
+           } else if ("content".equalsIgnoreCase(attrName)) {
+             contentNode = attr;
+           }
+         }
+
+         if (nameNode != null && contentNode != null) {
+           String name = nameNode.getNodeValue().toLowerCase();
+           metaTags.getGeneralTags().setProperty(name,
+               contentNode.getNodeValue());
+         }
+
+         if (equivNode != null && contentNode != null) {
+           String name = equivNode.getNodeValue().toLowerCase();
+           String content = contentNode.getNodeValue();
+           metaTags.getHttpEquivTags().setProperty(name, content);
+         }
+       }
+     }
+
+     NodeList children = node.getChildNodes();
+     if (children != null) {
+       int len = children.getLength();
+       for (int i = 0; i < len; i++) {
+         getMetaTagsHelper(metaTags, children.item(i), currURL);
+       }
+     }
+   }
+
+   /**
+    * Resets {@code metaTags} and fills it from the DOM rooted at {@code node}.
+    *
+    * @param metaTags
+    *          receiver of the collected tags; reset before collection starts
+    * @param node
+    *          root of the DOM to scan
+    * @param currURL
+    *          forwarded to {@link #getMetaTagsHelper}
+    */
+   public static final void getMetaTags(HTMLMetaTags metaTags, Node node,
+       URL currURL) {
+
+     metaTags.reset();
+     getMetaTagsHelper(metaTags, node, currURL);
+   }
+
+   /**
+    * Parses raw HTML bytes into a DOM fragment, with the input encoding set to
+    * utf-8.
+    *
+    * @param content
+    *          raw HTML
+    * @return the fragment the parser filled in
+    * @throws IOException
+    *           if the parser fails reading the input
+    * @throws SAXException
+    *           if the parser reports a parse error
+    */
+   private static DocumentFragment getDOMDocument(byte[] content)
+       throws IOException, SAXException {
+     InputSource input = new InputSource(new ByteArrayInputStream(content));
+     input.setEncoding("utf-8");
+     DOMFragmentParser parser = new DOMFragmentParser();
+     DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+     parser.parse(input, node);
+     return node;
+   }
+
+   /**
+    * Parses the sample file through {@link ParseUtil} (per the original
+    * author's note: parse-html together with the other parse filters) and
+    * checks the stored meta tags.
+    */
+   @Test
+   public void testMetaTagsParserWithConf() {
+     assertSampleMetaTags(parseMetaTags(sampleFile, true));
+   }
+
+   /**
+    * Builds the DOM tree locally and calls the parse-metatags filter directly,
+    * without going through {@link ParseUtil}, then checks the stored meta
+    * tags.
+    */
+   @Test
+   public void testFilter() {
+     assertSampleMetaTags(parseMetaTags(sampleFile, false));
+   }
+
+   /**
+    * Checks that the metadata holds the expected description and keywords.
+    *
+    * @param meta
+    *          metadata returned by {@link #parseMetaTags}
+    */
+   private void assertSampleMetaTags(Map<Utf8, ByteBuffer> meta) {
+     // check that we get the same values
+     assertEquals(description,
+         getMeta(meta, MetaTagsParser.PARSE_META_PREFIX + "description"));
+     assertEquals(keywords,
+         getMeta(meta, MetaTagsParser.PARSE_META_PREFIX + "keywords"));
+   }
+
+   /**
+    * Looks up a metadata entry and decodes it with {@link Bytes#toString}.
+    *
+    * @param meta
+    *          metadata map to read from
+    * @param name
+    *          full key, including the prefix
+    * @return the decoded value, or null when the map has no entry for the key
+    */
+   private String getMeta(Map<Utf8, ByteBuffer> meta, String name) {
+     ByteBuffer raw = meta.get(new Utf8(name));
+     if (raw == null) {
+       // Let assertEquals report a plain mismatch for a missing tag instead
+       // of handing null to the decoder.
+       return null;
+     }
+     return Bytes.toString(raw);
+   }
+
+ }
Modified:
nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java?rev=1577143&r1=1577142&r2=1577143&view=diff
==============================================================================
---
nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java
(original)
+++
nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java
Thu Mar 13 12:51:12 2014
@@ -92,8 +92,6 @@ public class TestIndexingFilters {
// set content metadata
Metadata md = new Metadata();
md.add("example","data");
- // set content metadata property defined in MetadataIndexer
- conf.set("index.content.md","example");
// add MetadataIndxer filter
conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
IndexingFilters filters2 = new IndexingFilters(conf);