Author: jnioche
Date: Wed Jan 5 10:12:20 2011
New Revision: 1055392
URL: http://svn.apache.org/viewvc?rev=1055392&view=rev
Log:
NUTCH-855 adapted code to 1.3 and removed reference to Lucene indexer + added
Ivy file
Added:
nutch/branches/branch-1.3/src/plugin/urlmeta/ivy.xml
Modified:
nutch/branches/branch-1.3/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
Added: nutch/branches/branch-1.3/src/plugin/urlmeta/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/urlmeta/ivy.xml?rev=1055392&view=auto
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/urlmeta/ivy.xml (added)
+++ nutch/branches/branch-1.3/src/plugin/urlmeta/ivy.xml Wed Jan 5 10:12:20
2011
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
Modified:
nutch/branches/branch-1.3/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java?rev=1055392&r1=1055391&r2=1055392&view=diff
==============================================================================
---
nutch/branches/branch-1.3/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
(original)
+++
nutch/branches/branch-1.3/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
Wed Jan 5 10:12:20 2011
@@ -26,7 +26,6 @@ import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.nutch.parse.Parse;
/**
@@ -37,102 +36,83 @@ import org.apache.nutch.parse.Parse;
* with your URLs will be indexed alongside those URLs--and can be directly
* queried, assuming you have done everything else correctly.
*
- * The flat-file of URLs you are injecting should, per NUTCH-655, be
tab-delimited
- * in the form of:
+ * The flat-file of URLs you are injecting should, per NUTCH-655, be
+ * tab-delimited in the form of:
*
- * [www.url.com]\t[key1]=[value1]\t[key2]=[value2]...[keyN]=[valueN]
+ * [www.url.com]\t[key1]=[value1]\t[key2]=[value2]...[keyN]=[valueN]
*
- * Be aware that if you collide with keywords that are already in use
(such
- * as nutch.score/nutch.fetchInterval) then you are in for some
unpredictable
- * behavior.
- *
- * Furthermore, in your nutch-site.xml config, you must specify that this
- * plugin is to be used (1), as well as what (2) Meta Tags it should
- * actively look for. This does not mean that you must use these tags for
- * every URL, but it does mean that you must list _all_ of meta tags that
- * you have specified. If you want them to be propagated and indexed, that
- * is.
- *
- * 1. As of Nutch 1.2, the property "plugin.includes" looks as follows:
- *
<value>protocol-http|urlfilter-regex|parse-(text|html|js|tika|rss)|index
- *
-(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic
- * |scoring-opic|urlnormalizer-(pass|regex|basic)</value> You must change
- * "index-(basic|anchor)" to "index-(basic|anchor|urlmeta)", in order to
- * call this plugin.
- *
- * 2. You must also specify the property "urlmeta.tags", who's values are
- * comma-delimited <value>key1, key2, key3</value>
+ * Be aware that if you collide with keywords that are already in use (such as
+ * nutch.score/nutch.fetchInterval) then you are in for some unpredictable
+ * behavior.
+ *
+ * Furthermore, in your nutch-site.xml config, you must specify that this plugin
+ * is to be used (1), as well as what (2) Meta Tags it should actively look for.
+ * This does not mean that you must use these tags for every URL, but it does
+ * mean that you must list _all_ of the meta tags that you have specified, if
+ * you want them to be propagated and indexed.
+ *
+ * 1. As of Nutch 1.2, the property "plugin.includes" looks as follows:
+ * <value>protocol-http|urlfilter-regex|parse-(text|html|js|tika|rss)|index
+ * -(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic
+ * |scoring-opic|urlnormalizer-(pass|regex|basic)</value> You must change
+ * "index-(basic|anchor)" to "index-(basic|anchor|urlmeta)", in order to call
+ * this plugin.
*
- * TODO: It may be ideal to offer two separate properties, to specify what
- * gets indexed versus merely propagated.
+ * 2. You must also specify the property "urlmeta.tags", whose values are
+ * comma-delimited <value>key1, key2, key3</value>
+ *
+ * TODO: It may be ideal to offer two separate properties, to specify what gets
+ * indexed versus merely propagated.
*
*/
public class URLMetaIndexingFilter implements IndexingFilter {
- private static final Log LOG =
LogFactory.getLog(URLMetaIndexingFilter.class);
- private static final String CONF_PROPERTY = "urlmeta.tags";
- private static String[] urlMetaTags;
- private Configuration conf;
-
- /**
- * This will take the metatags that you have listed in your "urlmeta.tags"
- * property, and looks for them inside the CrawlDatum object. If they exist,
- * this will add it as an attribute inside the NutchDocument.
- *
- * @see IndexingFilter#filter
- */
- public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
- CrawlDatum datum, Inlinks inlinks) throws IndexingException {
- if (conf != null)
- this.setConf(conf);
-
- if (urlMetaTags == null || doc == null)
- return doc;
-
- for (String metatag : urlMetaTags) {
- Text metadata = (Text) datum.getMetaData().get(new Text(metatag));
-
- if (metadata != null)
- doc.add(metatag, metadata.toString());
- }
-
- return doc;
- }
-
- /**
- * This tells the LuceneWriter that the above attributes should be part of
its
- * Indexing process.
- *
- * @see IndexingFilter#addIndexBackendOptions
- */
- public void addIndexBackendOptions(Configuration conf) {
- if (conf != null)
- this.setConf(conf);
-
- if (urlMetaTags == null)
- return;
-
- for (String metatag : urlMetaTags) {
- LuceneWriter.addFieldOptions(metatag, LuceneWriter.STORE.YES,
- LuceneWriter.INDEX.TOKENIZED, conf);
- }
- }
-
- /** Boilerplate */
- public Configuration getConf() {
- return conf;
- }
-
- /**
- * handles conf assignment and pulls the value assignment from the
- * "urlmeta.tags" property
- */
- public void setConf(Configuration conf) {
- this.conf = conf;
+ private static final Log LOG = LogFactory
+ .getLog(URLMetaIndexingFilter.class);
+ private static final String CONF_PROPERTY = "urlmeta.tags";
+ private static String[] urlMetaTags;
+ private Configuration conf;
+
+ /**
+ * This takes the metatags that you have listed in your "urlmeta.tags"
+ * property and looks for them inside the CrawlDatum object. Each one that
+ * exists is added as an attribute inside the NutchDocument.
+ *
+ * @see IndexingFilter#filter
+ */
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+ CrawlDatum datum, Inlinks inlinks) throws
IndexingException {
+ if (conf != null)
+ this.setConf(conf);
+
+ if (urlMetaTags == null || doc == null)
+ return doc;
+
+ for (String metatag : urlMetaTags) {
+ Text metadata = (Text) datum.getMetaData().get(new
Text(metatag));
+
+ if (metadata != null)
+ doc.add(metatag, metadata.toString());
+ }
+
+ return doc;
+ }
+
+ /** Boilerplate */
+ public Configuration getConf() {
+ return conf;
+ }
+
+ /**
+ * handles conf assignment and pulls the value assignment from the
+ * "urlmeta.tags" property
+ */
+ public void setConf(Configuration conf) {
+ this.conf = conf;
- if (conf == null)
- return;
+ if (conf == null)
+ return;
- urlMetaTags = conf.getStrings(CONF_PROPERTY);
- }
+ urlMetaTags = conf.getStrings(CONF_PROPERTY);
+ }
}
\ No newline at end of file