Author: lewismc
Date: Thu Mar 13 12:51:12 2014
New Revision: 1577143

URL: http://svn.apache.org/r1577143
Log:
NUTCH-1478 Parse-metatags and index-metadata plugin for Nutch 2.x series

Added:
    nutch/branches/2.x/src/plugin/index-metadata/
    nutch/branches/2.x/src/plugin/index-metadata/build.xml
    nutch/branches/2.x/src/plugin/index-metadata/ivy.xml
    nutch/branches/2.x/src/plugin/index-metadata/plugin.xml
    nutch/branches/2.x/src/plugin/index-metadata/src/
    nutch/branches/2.x/src/plugin/index-metadata/src/java/
    nutch/branches/2.x/src/plugin/index-metadata/src/java/org/
    nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/
    nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/
    
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/
    
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/
    
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
    nutch/branches/2.x/src/plugin/parse-metatags/
    nutch/branches/2.x/src/plugin/parse-metatags/README.txt
    nutch/branches/2.x/src/plugin/parse-metatags/build.xml
    nutch/branches/2.x/src/plugin/parse-metatags/ivy.xml
    nutch/branches/2.x/src/plugin/parse-metatags/plugin.xml
    nutch/branches/2.x/src/plugin/parse-metatags/sample/
    nutch/branches/2.x/src/plugin/parse-metatags/sample/testMetatags.html
    
nutch/branches/2.x/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
    nutch/branches/2.x/src/plugin/parse-metatags/src/
    nutch/branches/2.x/src/plugin/parse-metatags/src/java/
    nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/
    nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/
    nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/
    
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/
    
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java
    nutch/branches/2.x/src/plugin/parse-metatags/src/test/
    nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/
    nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/
    nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/
    
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/
    
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/TestMetaTagsParser.java
Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/build.xml
    nutch/branches/2.x/conf/nutch-default.xml
    nutch/branches/2.x/conf/schema.xml
    nutch/branches/2.x/src/plugin/build.xml
    
nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1577143&r1=1577142&r2=1577143&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Thu Mar 13 12:51:12 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1478 Parse-metatags and index-metadata plugin for Nutch 2.x series 
(kiran, Nguyen Manh Tien, Talat UYARER, Vangelis Karvounis via lewismc)
+
 * NUTCH-1729 Upgrade to Tika 1.5 (jnioche)
 
 * NUTCH-1721 Upgrade to Crawler commons 0.3 (tejasp)

Modified: nutch/branches/2.x/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/build.xml?rev=1577143&r1=1577142&r2=1577143&view=diff
==============================================================================
--- nutch/branches/2.x/build.xml (original)
+++ nutch/branches/2.x/build.xml Thu Mar 13 12:51:12 2014
@@ -151,9 +151,11 @@
    <packageset dir="${src.dir}"/>
       <packageset dir="${plugins.dir}/creativecommons/src/java"/>
       <packageset dir="${plugins.dir}/feed/src/java"/>
+         <packageset dir="${plugins.dir}/index-metadata/src/java"/>
       <packageset dir="${plugins.dir}/index-anchor/src/java"/>
       <packageset dir="${plugins.dir}/index-basic/src/java"/>
       <packageset dir="${plugins.dir}/index-more/src/java"/>
+      <packageset dir="${plugins.dir}/index-metadata/src/java"/>
       <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
       <packageset dir="${plugins.dir}/language-identifier/src/java"/>
       <packageset dir="${plugins.dir}/lib-http/src/java"/>
@@ -161,6 +163,7 @@
       <packageset dir="${plugins.dir}/microformats-reltag/src/java"/>
       <packageset dir="${plugins.dir}/parse-ext/src/java"/>
       <packageset dir="${plugins.dir}/parse-html/src/java"/>
+      <packageset dir="${plugins.dir}/parse-metatags/src/java"/>
       <packageset dir="${plugins.dir}/parse-js/src/java"/>
       <packageset dir="${plugins.dir}/parse-swf/src/java"/>
       <packageset dir="${plugins.dir}/parse-tika/src/java"/>
@@ -576,6 +579,7 @@
       <packageset dir="${plugins.dir}/parse-ext/src/java"/>
       <packageset dir="${plugins.dir}/parse-html/src/java"/>
       <packageset dir="${plugins.dir}/parse-js/src/java"/>
+      <packageset dir="${plugins.dir}/parse-metatags/src/java"/>
       <packageset dir="${plugins.dir}/parse-swf/src/java"/>
       <packageset dir="${plugins.dir}/parse-tika/src/java"/>
       <packageset dir="${plugins.dir}/parse-zip/src/java"/>
@@ -937,6 +941,7 @@
         <source path="${basedir}/src/plugin/index-basic/src/test/" />
         <source path="${basedir}/src/plugin/index-more/src/java/" />
         <source path="${basedir}/src/plugin/index-more/src/test/" />
+        <source path="${basedir}/src/plugin/index-metadata/src/java/" />
         <source path="${basedir}/src/plugin/language-identifier/src/java/" />
         <source path="${basedir}/src/plugin/language-identifier/src/test/" />
         <source path="${basedir}/src/plugin/lib-http/src/java/" />
@@ -952,6 +957,8 @@
         <source path="${basedir}/src/plugin/parse-html/src/test/" />
         <source path="${basedir}/src/plugin/parse-js/src/java/" />
         <source path="${basedir}/src/plugin/parse-js/src/test/" />
+        <source path="${basedir}/src/plugin/parse-metatags/src/java/" />
+        <source path="${basedir}/src/plugin/parse-metatags/src/test/" />
         <!-- parse-swf and parse-zip are currently disabled 
         <source path="${basedir}/src/plugin/parse-swf/src/java/" />
         <source path="${basedir}/src/plugin/parse-swf/src/test/" />

Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1577143&r1=1577142&r2=1577143&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Thu Mar 13 12:51:12 2014
@@ -1096,6 +1096,30 @@
   </description>
 </property>
 
+<!-- index-metadata plugin properties -->
+
+<property>
+  <name>index.metadata</name>
+  <value>description,keywords</value>
+  <description>
+  Comma-separated list of keys to be taken from the metadata to generate 
fields.
+  Can be used e.g. for 'description' or 'keywords' provided that these values 
are generated
+  by a parser (see parse-metatags plugin)  
+  </description>
+</property>
+
+<!-- parse-metatags plugin properties -->
+<property>
+  <name>metatags.names</name>
+  <value>*</value>
+  <description> Names of the metatags to extract, separated by ';'. 
+  Use '*' to extract all metatags. Prefixes the names with 'meta_'
+  in the parse-metadata. For instance to index description and keywords, 
+  you need to activate the plugin index-metadata and set the value of the 
+  parameter 'index.metadata' to 'description,keywords' (comma-separated; the index-metadata plugin adds the 'meta_' prefix itself).
+  </description>
+</property>
+
 <!-- Temporary Hadoop 0.17.x workaround. -->
 
 <property>

Modified: nutch/branches/2.x/conf/schema.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1577143&r1=1577142&r2=1577143&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema.xml (original)
+++ nutch/branches/2.x/conf/schema.xml Thu Mar 13 12:51:12 2014
@@ -336,6 +336,9 @@
     <field name="contentLength" type="string" stored="true" indexed="false"/>
     <field name="lastModified" type="date" stored="true" indexed="false"/>
     <field name="date" type="tdate" stored="true" indexed="true"/>
+    
+    <!-- fields for index-metadata plugin -->  
+    <dynamicField name="meta_*" type="string" stored="true" indexed="true"/>
 
     <!-- fields for languageidentifier plugin -->
     <field name="lang" type="string" stored="true" indexed="true"/>

Modified: nutch/branches/2.x/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/build.xml?rev=1577143&r1=1577142&r2=1577143&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/build.xml (original)
+++ nutch/branches/2.x/src/plugin/build.xml Thu Mar 13 12:51:12 2014
@@ -30,6 +30,7 @@
      <ant dir="index-anchor" target="deploy"/>
      <ant dir="index-basic" target="deploy"/>
      <ant dir="index-more" target="deploy"/>
+     <ant dir="index-metadata" target="deploy"/>
      <ant dir="indexer-solr" target="deploy"/>
      <ant dir="indexer-elastic" target="deploy"/>
      <ant dir="language-identifier" target="deploy"/>
@@ -47,6 +48,7 @@
      <ant dir="parse-js" target="deploy"/>
      <ant dir="parse-html" target="deploy"/>
      <ant dir="parse-tika" target="deploy"/>
+     <ant dir="parse-metatags" target="deploy"/>
      <ant dir="scoring-link" target="deploy"/>
      <ant dir="scoring-opic" target="deploy"/>
      <ant dir="subcollection" target="deploy"/>
@@ -77,6 +79,7 @@
      <ant dir="protocol-file" target="test"/>
      <ant dir="parse-html" target="test"/>
      <ant dir="parse-js" target="test"/>
+     <ant dir="parse-metatags" target="test"/>
      <ant dir="index-anchor" target="test"/>
      <ant dir="index-basic" target="test"/>
      <ant dir="index-more" target="test"/>
@@ -113,6 +116,7 @@
     <ant dir="index-anchor" target="clean"/>
     <ant dir="index-basic" target="clean"/>
     <ant dir="index-more" target="clean"/>
+    <ant dir="index-metadata" target="clean"/>
     <ant dir="indexer-solr" target="clean"/>
     <ant dir="indexer-elastic" target="clean"/>
     <ant dir="language-identifier" target="clean"/>
@@ -128,6 +132,7 @@
     <ant dir="parse-swf" target="clean"/>
     <ant dir="parse-tika" target="clean"/>
     <ant dir="parse-zip" target="clean"/>
+    <ant dir="parse-metatags" target="clean"/>
     <ant dir="protocol-file" target="clean"/>
     <ant dir="protocol-ftp" target="clean"/>
     <ant dir="protocol-httpclient" target="clean"/>

Added: nutch/branches/2.x/src/plugin/index-metadata/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-metadata/build.xml?rev=1577143&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/index-metadata/build.xml (added)
+++ nutch/branches/2.x/src/plugin/index-metadata/build.xml Thu Mar 13 12:51:12 
2014
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-metadata" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

Added: nutch/branches/2.x/src/plugin/index-metadata/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-metadata/ivy.xml?rev=1577143&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/index-metadata/ivy.xml (added)
+++ nutch/branches/2.x/src/plugin/index-metadata/ivy.xml Thu Mar 13 12:51:12 
2014
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

Added: nutch/branches/2.x/src/plugin/index-metadata/plugin.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-metadata/plugin.xml?rev=1577143&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/index-metadata/plugin.xml (added)
+++ nutch/branches/2.x/src/plugin/index-metadata/plugin.xml Thu Mar 13 12:51:12 
2014
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="index-metadata"
+   name="Metadata Indexing Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+    <runtime>
+      <library name="index-metadata.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+
+   <extension id="org.apache.nutch.indexer.metadata"
+              name="Nutch Metadata Indexing Filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="MetadataIndexer"
+                      
class="org.apache.nutch.indexer.metadata.MetadataIndexer"/>
+   </extension>
+
+</plugin>

Added: 
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java?rev=1577143&view=auto
==============================================================================
--- 
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
 (added)
+++ 
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
 Thu Mar 13 12:51:12 2014
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.metadata;
+
+import java.nio.ByteBuffer;
+import java.util.Collection;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.storage.WebPage.Field;
+
+/**
+ * Indexing filter that copies selected entries of a page's metadata into the
+ * index document.
+ * <p>
+ * The property <code>index.metadata</code> holds a comma-separated list of
+ * names, e.g. <code>description,keywords</code>. For each name the value
+ * stored in the page metadata under <code>meta_&lt;name&gt;</code> is looked
+ * up, split on tab characters, and every part is added to the document field
+ * <code>meta_&lt;name&gt;</code>.
+ */
+
+public class MetadataIndexer implements IndexingFilter {
+  private static final String PARSE_CONF_PROPERTY = "index.metadata";
+  private static final String INDEX_PREFIX = "meta_";
+  private static final String PARSE_META_PREFIX = "meta_";
+
+  /**
+   * Charset used to decode metadata values instead of the platform default.
+   * NOTE(review): assumes the producing parser stores values as UTF-8 bytes;
+   * confirm against the parse-metatags plugin.
+   */
+  private static final java.nio.charset.Charset UTF_8 =
+      java.nio.charset.Charset.forName("UTF-8");
+
+  /** Page fields read by {@link #filter}: only the metadata map. */
+  private static final Collection<Field> FIELDS =
+      java.util.Collections.singleton(Field.METADATA);
+
+  private Configuration conf;
+
+  /**
+   * Names listed in <code>index.metadata</code>; <code>null</code> when the
+   * property is not set. Kept per instance so that filters configured
+   * differently do not overwrite each other's settings.
+   */
+  private String[] parseFieldnames;
+
+  /**
+   * Adds the configured metadata entries of <code>page</code> to
+   * <code>doc</code>.
+   *
+   * @param doc
+   *          document to enrich; returned unchanged when <code>null</code>
+   * @param url
+   *          URL of the page (not used by this filter)
+   * @param page
+   *          page whose metadata is read
+   * @return the same <code>doc</code> instance
+   * @throws IndexingException
+   *           declared by the interface; not thrown by this implementation
+   */
+  @Override
+  public NutchDocument filter(NutchDocument doc, String url, WebPage page)
+      throws IndexingException {
+
+    // nothing to do without a document or without configured names
+    if (doc == null || parseFieldnames == null) {
+      return doc;
+    }
+
+    for (String metatag : parseFieldnames) {
+      ByteBuffer bvalues = page.getFromMetadata(new Utf8(PARSE_META_PREFIX
+          + metatag));
+      if (bvalues == null) {
+        continue;
+      }
+      // Decode a duplicate so that only the readable region (position..limit)
+      // is used and the stored buffer's position is left untouched.
+      String value = UTF_8.decode(bvalues.duplicate()).toString();
+      // multiple values of one metatag are separated by tabs
+      for (String eachvalue : value.split("\t")) {
+        doc.add(INDEX_PREFIX + metatag, eachvalue);
+      }
+    }
+
+    return doc;
+  }
+
+  /**
+   * Stores the configuration and reads the list of metadata names from the
+   * property <code>index.metadata</code>.
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.parseFieldnames = conf.getStrings(PARSE_CONF_PROPERTY);
+    // TODO check conflict between field names e.g. could have same label
+    // from different sources
+  }
+
+  @Override
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * @return the page fields this filter reads, so that callers can request
+   *         them from the storage layer
+   */
+  @Override
+  public Collection<Field> getFields() {
+    return FIELDS;
+  }
+}

Added: nutch/branches/2.x/src/plugin/parse-metatags/README.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/README.txt?rev=1577143&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-metatags/README.txt (added)
+++ nutch/branches/2.x/src/plugin/parse-metatags/README.txt Thu Mar 13 12:51:12 
2014
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+Parse-metatags plugin
+
+The parse-metatags plugin consists of a ParseFilter which takes as 
+parameter a list of metatag names with '*' as default value. The values 
+are separated by ';'.
+In order to extract the values of the metatags description and keywords, 
+you must specify in nutch-site.xml
+
+<property>
+  <name>metatags.names</name>
+  <value>description;keywords</value>
+</property>
+
+Prefixes the names with 'meta_' in the parse-metadata. For instance to 
+index description and keywords, you need to activate the plugin index-metadata 
+and set the value of the parameter 'index.metadata' to 
'description,keywords' (comma-separated; index-metadata adds the 'meta_' prefix itself).
+  
+This code has been developed by DigitalPebble Ltd and offered to the community 
by ANT.com
+
+
+
+

Added: nutch/branches/2.x/src/plugin/parse-metatags/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/build.xml?rev=1577143&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-metatags/build.xml (added)
+++ nutch/branches/2.x/src/plugin/parse-metatags/build.xml Thu Mar 13 12:51:12 
2014
@@ -0,0 +1,53 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-metatags" default="jar-core">
+
+       <import file="../build-plugin.xml" />
+
+       <!-- Build compilation dependencies -->
+       <target name="deps-jar">
+               <ant target="jar" inheritall="false" dir="../lib-nekohtml"/>
+               <ant target="jar" inheritall="false" dir="../parse-html"/>
+       </target>
+
+       <!-- Add compilation dependencies to classpath -->
+       <path id="plugin.deps">
+               <fileset dir="${nutch.root}/build">
+                       <include name="**/lib-nekohtml/*.jar" />
+                       <include name="**/parse-html/*.jar" />
+               </fileset>
+       </path>
+
+       <!-- Deploy Unit test dependencies -->
+       <target name="deps-test">
+               <ant target="deploy" inheritall="false" dir="../lib-nekohtml"/>
+               <ant target="deploy" inheritall="false" dir="../parse-html"/>
+               <ant target="deploy" inheritall="false" 
dir="../nutch-extensionpoints" />
+               <ant target="deploy" inheritall="false" dir="../protocol-file" 
/>
+       </target>
+
+
+       <!-- for junit test -->
+       <mkdir dir="${build.test}/data" />
+       <copy todir="${build.test}/data">
+               <fileset dir="sample">
+                       <include name="*.html"/>
+               </fileset>
+       </copy>
+
+</project>

Added: nutch/branches/2.x/src/plugin/parse-metatags/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/ivy.xml?rev=1577143&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-metatags/ivy.xml (added)
+++ nutch/branches/2.x/src/plugin/parse-metatags/ivy.xml Thu Mar 13 12:51:12 
2014
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

Added: nutch/branches/2.x/src/plugin/parse-metatags/plugin.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/plugin.xml?rev=1577143&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-metatags/plugin.xml (added)
+++ nutch/branches/2.x/src/plugin/parse-metatags/plugin.xml Thu Mar 13 12:51:12 
2014
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="parse-metatags"
+   name="MetaTags"
+   version="1.0"
+   provider-name="digitalpebble.com">
+
+   <runtime>
+      <library name="parse-metatags.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <extension id="org.apache.nutch.parse.metatags.parser"
+              name="MetaTags Parser"
+              point="org.apache.nutch.parse.ParseFilter">
+      <implementation id="MetaTagsParser"
+                      class="org.apache.nutch.parse.MetaTagsParser"/>
+   </extension>
+
+</plugin>
+

Added: nutch/branches/2.x/src/plugin/parse-metatags/sample/testMetatags.html
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/sample/testMetatags.html?rev=1577143&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-metatags/sample/testMetatags.html 
(added)
+++ nutch/branches/2.x/src/plugin/parse-metatags/sample/testMetatags.html Thu 
Mar 13 12:51:12 2014
@@ -0,0 +1,11 @@
+<html>
+<head>
+<meta name="Keywords" content="This is a test of keywords" />
+<meta name="Description" content="This is a test of description" />
+<meta name="Creator" content="Author1" />
+<meta name="Creator" content="Author2" />
+</head>
+<body>
+text of the document
+</body>
+

Added: 
nutch/branches/2.x/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/sample/testMultivalueMetatags.html?rev=1577143&view=auto
==============================================================================
--- 
nutch/branches/2.x/src/plugin/parse-metatags/sample/testMultivalueMetatags.html 
(added)
+++ 
nutch/branches/2.x/src/plugin/parse-metatags/sample/testMultivalueMetatags.html 
Thu Mar 13 12:51:12 2014
@@ -0,0 +1,13 @@
+<html>
+<head>
+<meta name="DC.creator" content="Doug Cutting">
+<meta name="DC.creator" content="Michael Cafarella">
+<!-- meta keywords in different casing -->
+<meta name="keywords" lang="en" content="web crawler" />
+<meta name="Keywords" lang="fr" content="robot d'indexation" />
+<meta name="KEYWORDS" lang="de" content="Webcrawler" />
+</head>
+<body>
+A test for multi-valued metatags.
+</body>
+</html>
\ No newline at end of file

Added: 
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java?rev=1577143&view=auto
==============================================================================
--- 
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java
 (added)
+++ 
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java
 Thu Mar 13 12:51:12 2014
@@ -0,0 +1,148 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+import java.util.Map.Entry;
+
+import org.apache.avro.util.Utf8;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.storage.WebPage.Field;
+import org.apache.nutch.util.Bytes;
+import org.w3c.dom.DocumentFragment;
+
+/**
+ * Parse HTML meta tags (e.g. keywords, description) and store them in the page
+ * metadata, under keys starting with the prefix 'meta_', so that they can be
+ * indexed with the index-metadata plugin.
+ ***/
+
+public class MetaTagsParser implements ParseFilter {
+
+  private static final Log LOG = LogFactory.getLog(MetaTagsParser.class
+      .getName());
+
+  /** Configuration handed over through {@link #setConf(Configuration)}. */
+  private Configuration conf;
+
+  /**
+   * Prefix put in front of the lower-cased tag name to build the key under
+   * which a value is stored in the page metadata.
+   */
+  public static final String PARSE_META_PREFIX = "meta_";
+
+  /**
+   * Lower-cased names of the meta tags to keep; the entry "*" stands for
+   * "keep every tag".
+   */
+  private Set<String> metatagset = new HashSet<String>();
+
+  /**
+   * Stores the configuration and reads the names of the meta tags to keep.
+   * <p>
+   * The property <code>metatags.names</code> holds the wanted names separated
+   * by ';'. Names are trimmed and compared in lower case. When the property is
+   * not set, or does not yield any name, "*" is used, which selects every tag.
+   * 
+   * @param conf
+   *          configuration to read <code>metatags.names</code> from
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    // start from scratch so that a second call does not accumulate names
+    metatagset.clear();
+    // specify whether we want a specific subset of metadata
+    // by default take everything we can find
+    String metatags = conf.get("metatags.names", "*");
+    for (String val : metatags.split(";")) {
+      // tolerate blanks around the separator and ignore empty entries
+      String name = val.trim().toLowerCase();
+      if (name.length() > 0) {
+        metatagset.add(name);
+      }
+    }
+    if (metatagset.isEmpty()) {
+      metatagset.add("*");
+    }
+  }
+
+  /**
+   * @return the configuration last passed to {@link #setConf(Configuration)}
+   */
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Copies the wanted meta tags of a page into its metadata, each one stored
+   * under a key made of {@link #PARSE_META_PREFIX} and the lower-cased name.
+   * <p>
+   * Three sources are consulted in turn: the metadata already present on the
+   * page, the general meta tags and the http-equiv meta tags. A name is kept
+   * when the configured set contains "*" or the lower-cased name.
+   * 
+   * @param url
+   *          URL of the page being parsed (not read by this filter)
+   * @param page
+   *          page whose metadata is read and extended
+   * @param parse
+   *          current parse result, handed back as is
+   * @param metaTags
+   *          meta tags extracted from the document
+   * @param doc
+   *          DOM of the document (not read by this filter)
+   * @return the <code>parse</code> argument, unchanged
+   */
+  public Parse filter(String url, WebPage page, Parse parse,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    // Prefixed copies are gathered in a temporary map and written back once
+    // the iteration is over - presumably because the page metadata should not
+    // be modified while it is being iterated over.
+    Map<Utf8, ByteBuffer> metadata = new HashMap<Utf8, ByteBuffer>();
+
+    // check in the metadata first : the tika-parser
+    // might have stored the values there already
+    for (Entry<Utf8, ByteBuffer> entry : page.getMetadata().entrySet()) {
+      String mdName = entry.getKey().toString();
+      if (isWanted(mdName)) {
+        // NOTE(review): toStringBinary is kept from the initial version; it
+        // looks like it escapes non-printable bytes - confirm this is wanted
+        String value = Bytes.toStringBinary(entry.getValue());
+        LOG.debug("Found meta tag: '" + mdName + "', with value: '" + value
+            + "'");
+        metadata.put(new Utf8(PARSE_META_PREFIX + mdName.toLowerCase()),
+            ByteBuffer.wrap(Bytes.toBytes(value)));
+      }
+    }
+    // write the prefixed copies back to the page
+    for (Entry<Utf8, ByteBuffer> entry : metadata.entrySet()) {
+      page.putToMetadata(entry.getKey(), entry.getValue());
+    }
+
+    Properties generalMetaTags = metaTags.getGeneralTags();
+    for (Object key : generalMetaTags.keySet()) {
+      String name = (String) key;
+      // check whether the name is in the list of what we want or if
+      // specified *
+      if (isWanted(name)) {
+        // each name maps to a single string here, so the value is stored as
+        // is, without any '\t' separator
+        String value = (String) generalMetaTags.get(name);
+        LOG.debug("Found meta tag : " + name + "\t" + value);
+        page.putToMetadata(new Utf8(PARSE_META_PREFIX + name.toLowerCase()),
+            ByteBuffer.wrap(Bytes.toBytes(value)));
+      }
+    }
+
+    Properties httpequiv = metaTags.getHttpEquivTags();
+    Enumeration<?> tagNames = httpequiv.propertyNames();
+    while (tagNames.hasMoreElements()) {
+      String name = (String) tagNames.nextElement();
+      // check whether the name is in the list of what we want or if
+      // specified *
+      if (isWanted(name)) {
+        String value = httpequiv.getProperty(name);
+        LOG.debug("Found meta tag : " + name + "\t" + value);
+        // same encoding helper as for the general tags, instead of the
+        // platform default charset
+        page.putToMetadata(new Utf8(PARSE_META_PREFIX + name.toLowerCase()),
+            ByteBuffer.wrap(Bytes.toBytes(value)));
+      }
+    }
+
+    return parse;
+  }
+
+  /** Tells whether a tag name is selected by the configured set of names. */
+  private boolean isWanted(String name) {
+    return metatagset.contains("*")
+        || metatagset.contains(name.toLowerCase());
+  }
+
+  /**
+   * Returns <code>null</code>: this filter does not declare any page field.
+   * NOTE(review): {@link #filter} reads and writes the page metadata;
+   * confirm whether that field should be declared here.
+   */
+  @Override
+  public Collection<Field> getFields() {
+    return null;
+  }
+
+}

Added: 
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/TestMetaTagsParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/TestMetaTagsParser.java?rev=1577143&view=auto
==============================================================================
--- 
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/TestMetaTagsParser.java
 (added)
+++ 
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/TestMetaTagsParser.java
 Thu Mar 13 12:51:12 2014
@@ -0,0 +1,216 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.html.dom.HTMLDocumentImpl;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.Bytes;
+import org.apache.nutch.util.NutchConfiguration;
+import org.cyberneko.html.parsers.DOMFragmentParser;
+import org.junit.Test;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.ByteBuffer;
+import java.util.Map;
+
+public class TestMetaTagsParser {
+
+  // separator used to build the path of the sample file
+  private String fileSeparator = System.getProperty("file.separator");
+  // directory holding the sample files: system property "test.data",
+  // current directory when the property is not set
+  private String sampleDir = System.getProperty("test.data", ".");
+  // sample document parsed by the tests
+  private String sampleFile = "testMetatags.html";
+  // values the tests expect for the description and keywords meta tags
+  private String description = "This is a test of description";
+  private String keywords = "This is a test of keywords";
+
+  /**
+   * Reads a sample file, runs it through the meta tags parser and returns the
+   * metadata of the resulting page. Any exception makes the calling test
+   * fail.
+   * 
+   * @param fileName
+   *          name of the sample file, looked up in the sample directory
+   * @param useUtil
+   *          if <code>true</code> the page is parsed with {@link ParseUtil};
+   *          otherwise the DOM is built here and handed directly to
+   *          {@link MetaTagsParser#filter}
+   * @return the metadata of the page once it has been parsed
+   */
+  public Map<Utf8, ByteBuffer> parseMetaTags(String fileName,
+      boolean useUtil) {
+    try {
+      Configuration conf = NutchConfiguration.create();
+      String urlString = "file:" + sampleDir + fileSeparator + fileName;
+
+      File file = new File(sampleDir + fileSeparator + fileName);
+      byte[] bytes = new byte[(int) file.length()];
+      DataInputStream in = new DataInputStream(new FileInputStream(file));
+      try {
+        in.readFully(bytes);
+      } finally {
+        // release the file handle even when reading fails
+        in.close();
+      }
+
+      WebPage page = new WebPage();
+      page.setBaseUrl(new Utf8(urlString));
+      page.setContent(ByteBuffer.wrap(bytes));
+      page.setContentType(new Utf8("text/html"));
+
+      if (useUtil) {
+        ParseUtil parser = new ParseUtil(conf);
+        parser.parse(urlString, page);
+      } else {
+        DocumentFragment node = getDOMDocument(bytes);
+        HTMLMetaTags metaTags = new HTMLMetaTags();
+        URL base = null;
+        try {
+          base = new URL(urlString);
+        } catch (MalformedURLException e) {
+          // best effort: the base URL is only passed along and is not read
+          // when the meta tags are collected
+          e.printStackTrace();
+        }
+        // get meta directives
+        getMetaTags(metaTags, node, base);
+
+        MetaTagsParser mtp = new MetaTagsParser();
+        mtp.setConf(conf);
+        mtp.filter(urlString, page, new Parse(), metaTags, node);
+      }
+
+      return page.getMetadata();
+    } catch (Exception e) {
+      e.printStackTrace();
+      fail(e.toString());
+      return null;
+    }
+  }
+
+  /**
+   * Walks the DOM below <code>node</code> and records every meta element
+   * having a content attribute: under its lower-cased name in the general
+   * tags and/or under its lower-cased http-equiv in the http-equiv tags.
+   * Nothing below a body element is visited.
+   * 
+   * @param metaTags
+   *          holder receiving the tags
+   * @param node
+   *          current node of the traversal
+   * @param currURL
+   *          handed down unchanged to the recursive calls
+   */
+  public static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node,
+      URL currURL) {
+
+    if (node.getNodeType() == Node.ELEMENT_NODE) {
+      String tagName = node.getNodeName();
+
+      // META tags should not be under body
+      if ("body".equalsIgnoreCase(tagName)) {
+        return;
+      }
+
+      if ("meta".equalsIgnoreCase(tagName)) {
+        Node nameAttr = null;
+        Node equivAttr = null;
+        Node contentAttr = null;
+
+        // pick up the name, http-equiv and content attributes
+        NamedNodeMap attributes = node.getAttributes();
+        int count = attributes.getLength();
+        for (int idx = 0; idx < count; idx++) {
+          Node attribute = attributes.item(idx);
+          String key = attribute.getNodeName().toLowerCase();
+          if ("name".equals(key)) {
+            nameAttr = attribute;
+          } else if ("http-equiv".equals(key)) {
+            equivAttr = attribute;
+          } else if ("content".equals(key)) {
+            contentAttr = attribute;
+          }
+        }
+
+        // without content there is nothing to record
+        if (contentAttr != null) {
+          if (nameAttr != null) {
+            metaTags.getGeneralTags().setProperty(
+                nameAttr.getNodeValue().toLowerCase(),
+                contentAttr.getNodeValue());
+          }
+          if (equivAttr != null) {
+            metaTags.getHttpEquivTags().setProperty(
+                equivAttr.getNodeValue().toLowerCase(),
+                contentAttr.getNodeValue());
+          }
+        }
+      }
+    }
+
+    NodeList children = node.getChildNodes();
+    if (children == null) {
+      return;
+    }
+    int total = children.getLength();
+    for (int idx = 0; idx < total; idx++) {
+      getMetaTagsHelper(metaTags, children.item(idx), currURL);
+    }
+  }
+
+  /**
+   * Resets <code>metaTags</code>, then fills it with the meta tags found by
+   * {@link #getMetaTagsHelper} starting at <code>node</code>.
+   * 
+   * @param metaTags
+   *          holder that is reset and filled
+   * @param node
+   *          node the search starts from
+   * @param currURL
+   *          passed on to {@link #getMetaTagsHelper}
+   */
+  public static final void getMetaTags(HTMLMetaTags metaTags, Node node,
+      URL currURL) {
+
+    metaTags.reset();
+    getMetaTagsHelper(metaTags, node, currURL);
+  }
+
+  /**
+   * Parses raw HTML, declared as utf-8, into a new DOM fragment.
+   * 
+   * @param content
+   *          bytes of the HTML document
+   * @return the fragment filled by the parser
+   */
+  private static DocumentFragment getDOMDocument(byte[] content)
+      throws IOException, SAXException {
+    InputSource source = new InputSource(new ByteArrayInputStream(content));
+    source.setEncoding("utf-8");
+    DOMFragmentParser fragmentParser = new DOMFragmentParser();
+    DocumentFragment fragment = new HTMLDocumentImpl()
+        .createDocumentFragment();
+    fragmentParser.parse(source, fragment);
+    return fragment;
+  }
+
+  /**
+   * Parses the sample file through ParseUtil and checks the description and
+   * keywords stored in the page metadata.
+   */
+  @Test
+  public void testMetaTagsParserWithConf() {
+    Map<Utf8, ByteBuffer> metadata = parseMetaTags(sampleFile, true);
+    String prefix = MetaTagsParser.PARSE_META_PREFIX;
+
+    // check that we get the same values
+    assertEquals(description, getMeta(metadata, prefix + "description"));
+    assertEquals(keywords, getMeta(metadata, prefix + "keywords"));
+  }
+
+  /**
+   * Builds the DOM tree in the test itself, without ParseUtil, so that the
+   * MetaTagsParser filter is called directly.
+   */
+  @Test
+  public void testFilter() {
+    Map<Utf8, ByteBuffer> metadata = parseMetaTags(sampleFile, false);
+    String prefix = MetaTagsParser.PARSE_META_PREFIX;
+
+    // check that we get the same values
+    assertEquals(description, getMeta(metadata, prefix + "description"));
+    assertEquals(keywords, getMeta(metadata, prefix + "keywords"));
+  }
+
+  /** Looks up <code>name</code> in the metadata and converts it to a string. */
+  private String getMeta(Map<Utf8, ByteBuffer> meta, String name) {
+    Utf8 key = new Utf8(name);
+    return Bytes.toString(meta.get(key));
+  }
+
+}

Modified: 
nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java?rev=1577143&r1=1577142&r2=1577143&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java 
(original)
+++ 
nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java 
Thu Mar 13 12:51:12 2014
@@ -92,8 +92,6 @@ public class TestIndexingFilters {
     // set content metadata
     Metadata md = new Metadata();
     md.add("example","data");
-    // set content metadata property defined in MetadataIndexer
-    conf.set("index.content.md","example");
     // add MetadataIndxer filter
     conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
     IndexingFilters filters2 = new IndexingFilters(conf);


Reply via email to