URLMetaIndexingFilter.java

jnioche Wed, 05 Jan 2011 02:12:47 -0800

Author: jnioche
Date: Wed Jan  5 10:12:20 2011
New Revision: 1055392

URL: http://svn.apache.org/viewvc?rev=1055392&view=rev
Log:
NUTCH-855 adapted code to 1.3 and removed reference to Lucene indexer + added 
Ivy file


Added:
    nutch/branches/branch-1.3/src/plugin/urlmeta/ivy.xml
Modified:
    
nutch/branches/branch-1.3/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java

Added: nutch/branches/branch-1.3/src/plugin/urlmeta/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/urlmeta/ivy.xml?rev=1055392&view=auto
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/urlmeta/ivy.xml (added)
+++ nutch/branches/branch-1.3/src/plugin/urlmeta/ivy.xml Wed Jan  5 10:12:20 
2011
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

Modified: 
nutch/branches/branch-1.3/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java?rev=1055392&r1=1055391&r2=1055392&view=diff
==============================================================================
--- 
nutch/branches/branch-1.3/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
 (original)
+++ 
nutch/branches/branch-1.3/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
 Wed Jan  5 10:12:20 2011
@@ -26,7 +26,6 @@ import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.indexer.IndexingException;
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.lucene.LuceneWriter;
 import org.apache.nutch.parse.Parse;
 
 /**
@@ -37,102 +36,83 @@ import org.apache.nutch.parse.Parse;
  * with your URLs will be indexed alongside those URLs--and can be directly
  * queried, assuming you have done everything else correctly.
  * 
- * The flat-file of URLs you are injecting should, per NUTCH-655, be 
tab-delimited 
- * in the form of:
+ * The flat-file of URLs you are injecting should, per NUTCH-655, be
+ * tab-delimited in the form of:
  * 
- *      [www.url.com]\t[key1]=[value1]\t[key2]=[value2]...[keyN]=[valueN]
+ * [www.url.com]\t[key1]=[value1]\t[key2]=[value2]...[keyN]=[valueN]
  * 
- *      Be aware that if you collide with keywords that are already in use 
(such
- *      as nutch.score/nutch.fetchInterval) then you are in for some 
unpredictable 
- *      behavior. 
- * 
- *      Furthermore, in your nutch-site.xml config, you must specify that this
- *      plugin is to be used (1), as well as what (2) Meta Tags it should
- *      actively look for. This does not mean that you must use these tags for
- *      every URL, but it does mean that you must list _all_ of meta tags that
- *      you have specified. If you want them to be propagated and indexed, that
- *      is. 
- *      
- *      1. As of Nutch 1.2, the property "plugin.includes" looks as follows:
- *      
<value>protocol-http|urlfilter-regex|parse-(text|html|js|tika|rss)|index
- *      
-(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic
- *      |scoring-opic|urlnormalizer-(pass|regex|basic)</value> You must change
- *      "index-(basic|anchor)" to "index-(basic|anchor|urlmeta)", in order to
- *      call this plugin.
- *      
- *      2. You must also specify the property "urlmeta.tags", who's values are
- *      comma-delimited <value>key1, key2, key3</value>
+ * Be aware that if you collide with keywords that are already in use (such as
+ * nutch.score/nutch.fetchInterval) then you are in for some unpredictable
+ * behavior.
+ * 
+ * Furthermore, in your nutch-site.xml config, you must specify that this 
plugin
+ * is to be used (1), as well as what (2) Meta Tags it should actively look 
for.
+ * This does not mean that you must use these tags for every URL, but it does
+ * mean that you must list _all_ of the meta tags that you have specified. If you
+ * want them to be propagated and indexed, that is.
+ * 
+ * 1. As of Nutch 1.2, the property "plugin.includes" looks as follows:
+ * <value>protocol-http|urlfilter-regex|parse-(text|html|js|tika|rss)|index
+ * -(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic
+ * |scoring-opic|urlnormalizer-(pass|regex|basic)</value> You must change
+ * "index-(basic|anchor)" to "index-(basic|anchor|urlmeta)", in order to call
+ * this plugin.
  * 
- *      TODO: It may be ideal to offer two separate properties, to specify what
- *      gets indexed versus merely propagated. 
+ * 2. You must also specify the property "urlmeta.tags", whose values are
+ * comma-delimited <value>key1, key2, key3</value>
+ * 
+ * TODO: It may be ideal to offer two separate properties, to specify what gets
+ * indexed versus merely propagated.
  * 
  */
 public class URLMetaIndexingFilter implements IndexingFilter {
 
-  private static final Log LOG = 
LogFactory.getLog(URLMetaIndexingFilter.class);
-  private static final String CONF_PROPERTY = "urlmeta.tags";
-  private static String[] urlMetaTags;
-  private Configuration conf;
-
-  /**
-   * This will take the metatags that you have listed in your "urlmeta.tags"
-   * property, and looks for them inside the CrawlDatum object. If they exist,
-   * this will add it as an attribute inside the NutchDocument.
-   * 
-   * @see IndexingFilter#filter
-   */
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
-      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-    if (conf != null)
-      this.setConf(conf);
-
-    if (urlMetaTags == null || doc == null)
-      return doc;
-
-    for (String metatag : urlMetaTags) {
-      Text metadata = (Text) datum.getMetaData().get(new Text(metatag));
-
-      if (metadata != null)
-        doc.add(metatag, metadata.toString());
-    }
-
-    return doc;
-  }
-
-  /**
-   * This tells the LuceneWriter that the above attributes should be part of 
its
-   * Indexing process.
-   * 
-   * @see IndexingFilter#addIndexBackendOptions
-   */
-  public void addIndexBackendOptions(Configuration conf) {
-    if (conf != null)
-      this.setConf(conf);
-
-    if (urlMetaTags == null)
-      return;
-
-    for (String metatag : urlMetaTags) {
-      LuceneWriter.addFieldOptions(metatag, LuceneWriter.STORE.YES,
-          LuceneWriter.INDEX.TOKENIZED, conf);
-    }
-  }
-
-  /** Boilerplate */
-  public Configuration getConf() {
-    return conf;
-  }
-
-  /**
-   * handles conf assignment and pulls the value assignment from the
-   * "urlmeta.tags" property
-   */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
+       private static final Log LOG = LogFactory
+                       .getLog(URLMetaIndexingFilter.class);
+       private static final String CONF_PROPERTY = "urlmeta.tags";
+       private static String[] urlMetaTags;
+       private Configuration conf;
+
+       /**
+        * This will take the metatags that you have listed in your 
"urlmeta.tags"
+        * property, and look for them inside the CrawlDatum object. If they 
exist,
+        * this will add them as attributes inside the NutchDocument.
+        * 
+        * @see IndexingFilter#filter
+        */
+       public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+                       CrawlDatum datum, Inlinks inlinks) throws 
IndexingException {
+               if (conf != null)
+                       this.setConf(conf);
+
+               if (urlMetaTags == null || doc == null)
+                       return doc;
+
+               for (String metatag : urlMetaTags) {
+                       Text metadata = (Text) datum.getMetaData().get(new 
Text(metatag));
+
+                       if (metadata != null)
+                               doc.add(metatag, metadata.toString());
+               }
+
+               return doc;
+       }
+
+       /** Boilerplate */
+       public Configuration getConf() {
+               return conf;
+       }
+
+       /**
+        * handles conf assignment and pulls the value assignment from the
+        * "urlmeta.tags" property
+        */
+       public void setConf(Configuration conf) {
+               this.conf = conf;
 
-    if (conf == null)
-      return;
+               if (conf == null)
+                       return;
 
-    urlMetaTags = conf.getStrings(CONF_PROPERTY);
-  }
+               urlMetaTags = conf.getStrings(CONF_PROPERTY);
+       }
 }
\ No newline at end of file

svn commit: r1055392 - in /nutch/branches/branch-1.3/src/plugin/urlmeta: ivy.xml src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java

Reply via email to