Author: lewismc Date: Sat Sep 10 23:46:00 2011 New Revision: 1167651 URL: http://svn.apache.org/viewvc?rev=1167651&view=rev Log: commit to address NUTCH-940 and update to changes.txt
Added: nutch/branches/branch-1.4/src/plugin/index-static/ nutch/branches/branch-1.4/src/plugin/index-static/build.xml nutch/branches/branch-1.4/src/plugin/index-static/ivy.xml nutch/branches/branch-1.4/src/plugin/index-static/plugin.xml nutch/branches/branch-1.4/src/plugin/index-static/src/ nutch/branches/branch-1.4/src/plugin/index-static/src/java/ nutch/branches/branch-1.4/src/plugin/index-static/src/java/org/ nutch/branches/branch-1.4/src/plugin/index-static/src/java/org/apache/ nutch/branches/branch-1.4/src/plugin/index-static/src/java/org/apache/nutch/ nutch/branches/branch-1.4/src/plugin/index-static/src/java/org/apache/nutch/indexer/ nutch/branches/branch-1.4/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/ nutch/branches/branch-1.4/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java Modified: nutch/branches/branch-1.4/CHANGES.txt nutch/branches/branch-1.4/conf/nutch-default.xml nutch/branches/branch-1.4/src/plugin/build.xml Modified: nutch/branches/branch-1.4/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/CHANGES.txt?rev=1167651&r1=1167650&r2=1167651&view=diff ============================================================================== --- nutch/branches/branch-1.4/CHANGES.txt (original) +++ nutch/branches/branch-1.4/CHANGES.txt Sat Sep 10 23:46:00 2011 @@ -2,6 +2,12 @@ Nutch Change Log Release 1.4 - Current development +* NUTCH-940 static field plugin (Claudio Martella via lewismc) + +* NUTCH-914 Implement Apache Project Branding Requirements (lewismc) + +* NUTCH-1095 remove i18n from Nutch site to archive and legacy section of wiki (lewismc) + * NUTCH-1101 Option to purge db_gone records with updatedb (markus) * NUTCH-1096 Empty (not null) ContentLength results in failure of fetch (Ferdy Galema via jnioche) Modified: nutch/branches/branch-1.4/conf/nutch-default.xml URL: 
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/conf/nutch-default.xml?rev=1167651&r1=1167650&r2=1167651&view=diff ============================================================================== --- nutch/branches/branch-1.4/conf/nutch-default.xml (original) +++ nutch/branches/branch-1.4/conf/nutch-default.xml Sat Sep 10 23:46:00 2011 @@ -1050,6 +1050,19 @@ </description> </property> +<!-- index-static plugin properties --> + +<property> + <name>index-static</name> + <value></value> + <description> + A simple plugin called at indexing that adds fields with static data. + You can specify a list of fieldname:fieldcontent per nutch job. + It can be useful when collections can't be created by urlpatterns, + like in subcollection, but on a job-basis. + </description> +</property> + <!-- Temporary Hadoop 0.17.x workaround. --> <property> Modified: nutch/branches/branch-1.4/src/plugin/build.xml URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/plugin/build.xml?rev=1167651&r1=1167650&r2=1167651&view=diff ============================================================================== --- nutch/branches/branch-1.4/src/plugin/build.xml (original) +++ nutch/branches/branch-1.4/src/plugin/build.xml Sat Sep 10 23:46:00 2011 @@ -31,6 +31,7 @@ <ant dir="index-basic" target="deploy"/> <ant dir="index-anchor" target="deploy"/> <ant dir="index-more" target="deploy"/> + <ant dir="index-static" target="deploy"/> <ant dir="languageidentifier" target="deploy"/> <ant dir="lib-http" target="deploy"/> <ant dir="lib-nekohtml" target="deploy"/> @@ -101,6 +102,7 @@ <ant dir="index-basic" target="clean"/> <ant dir="index-anchor" target="clean"/> <ant dir="index-more" target="clean"/> + <ant dir="index-static" target="clean"/> <ant dir="languageidentifier" target="clean"/> <ant dir="lib-commons-httpclient" target="clean"/> <ant dir="lib-http" target="clean"/> Added: nutch/branches/branch-1.4/src/plugin/index-static/build.xml URL: 
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/plugin/index-static/build.xml?rev=1167651&view=auto ============================================================================== --- nutch/branches/branch-1.4/src/plugin/index-static/build.xml (added) +++ nutch/branches/branch-1.4/src/plugin/index-static/build.xml Sat Sep 10 23:46:00 2011 @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="index-static" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> Added: nutch/branches/branch-1.4/src/plugin/index-static/ivy.xml URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/plugin/index-static/ivy.xml?rev=1167651&view=auto ============================================================================== --- nutch/branches/branch-1.4/src/plugin/index-static/ivy.xml (added) +++ nutch/branches/branch-1.4/src/plugin/index-static/ivy.xml Sat Sep 10 23:46:00 2011 @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="${nutch.root}/ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> Added: nutch/branches/branch-1.4/src/plugin/index-static/plugin.xml URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/plugin/index-static/plugin.xml?rev=1167651&view=auto ============================================================================== --- nutch/branches/branch-1.4/src/plugin/index-static/plugin.xml (added) +++ nutch/branches/branch-1.4/src/plugin/index-static/plugin.xml Sat Sep 10 23:46:00 2011 @@ -0,0 +1,42 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="index-static" + name="Index Static" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="index-static.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + + <extension id="org.apache.nutch.indexer.staticfield" + name="Nutch static field index" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="StaticField" + class="org.apache.nutch.indexer.staticfield.StaticFieldIndexer"/> + </extension> + +</plugin> Added: nutch/branches/branch-1.4/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java?rev=1167651&view=auto ============================================================================== --- nutch/branches/branch-1.4/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java (added) +++ nutch/branches/branch-1.4/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java Sat Sep 10 23:46:00 2011 @@ -0,0 +1,75 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.staticfield;
+
+import java.util.HashMap;
+import java.util.Map.Entry;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Indexing filter that adds the same, statically configured fields to every
+ * document it is given.
+ *
+ * The fields are read from the job configuration as a comma-separated list
+ * of {@code name:value} entries. A value may hold several
+ * whitespace-separated tokens, e.g.
+ * {@code collection:intranet,tags:alpha beta}.
+ */
+public class StaticFieldIndexer implements IndexingFilter {
+
+  /** Configuration key this filter has always read. */
+  private static final String FIELDS_KEY = "index.static";
+
+  /**
+   * Key under which the property is declared in nutch-default.xml. It
+   * differs from {@link #FIELDS_KEY}, so it is consulted as a fallback;
+   * otherwise a value set under the documented name would never be seen.
+   */
+  private static final String DOCUMENTED_FIELDS_KEY = "index-static";
+
+  private Configuration conf;
+  private HashMap<String, String[]> fields = new HashMap<String, String[]>();
+  private boolean addStaticFields = false;
+
+  /**
+   * Adds every configured static field to the document.
+   *
+   * @param doc the document being indexed; it is modified in place
+   * @param parse unused
+   * @param url unused
+   * @param datum unused
+   * @param inlinks unused
+   * @return the same {@code doc} instance that was passed in
+   * @throws IndexingException declared by the interface; not thrown here
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    if (this.addStaticFields) {
+      for (Entry<String, String[]> entry : this.fields.entrySet()) {
+        // NOTE(review): the token array is handed over as a single argument;
+        // confirm NutchDocument.add stores it as a multi-valued field.
+        doc.add(entry.getKey(), entry.getValue());
+      }
+    }
+    return doc;
+  }
+
+  /**
+   * Parses the configured field list.
+   *
+   * @param fieldsString comma-separated list of {@code name:value} entries
+   * @return map from field name to its whitespace-separated value tokens;
+   *         entries with no colon, an empty name or an empty value are
+   *         skipped
+   */
+  private HashMap<String, String[]> parseFields(String fieldsString) {
+    HashMap<String, String[]> parsed = new HashMap<String, String[]>();
+
+    for (String field : fieldsString.split(",")) {
+      // Split on the first colon only, so that values which themselves
+      // contain a colon (URLs, timestamps) are kept instead of dropped.
+      String[] entry = field.split(":", 2);
+      if (entry.length != 2) {
+        continue;
+      }
+
+      String name = entry[0].trim();
+      String value = entry[1].trim();
+      if (name.length() == 0 || value.length() == 0) {
+        continue;
+      }
+
+      // Runs of whitespace count as one separator, so no empty tokens
+      // end up in the index.
+      parsed.put(name, value.split("\\s+"));
+    }
+
+    return parsed;
+  }
+
+  /**
+   * Stores the configuration and rebuilds the static field list from it.
+   * Fields from an earlier call are discarded.
+   *
+   * @param conf the configuration to read the field list from
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // Start clean so that re-configuring never leaves stale fields behind.
+    this.addStaticFields = false;
+    this.fields = new HashMap<String, String[]>();
+
+    String fieldsString = conf.get(FIELDS_KEY, null);
+    if (fieldsString == null) {
+      fieldsString = conf.get(DOCUMENTED_FIELDS_KEY, null);
+    }
+
+    if (fieldsString != null) {
+      this.fields = parseFields(fieldsString);
+      this.addStaticFields = !this.fields.isEmpty();
+    }
+  }
+
+  /**
+   * @return the configuration last passed to {@link #setConf}, or
+   *         {@code null} if it has not been called yet
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+}