Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Nutch Wiki" for change 
notification.

The "WritingPluginExample" page has been changed by LewisJohnMcgibbney:
http://wiki.apache.org/nutch/WritingPluginExample?action=diff&rev1=20&rev2=21

  </project>
  }}}
  
- == The HTML Parser Extension ==
+ == The Indexer Extension ==
- This is the source code for the HTML Parser extension.  It tries to grab the 
contents of the recommended meta tag and add them to the document being parsed.
+ This is the source code for the !IndexingFilter extension. Meta tags that are 
attached to your crawl URLs at injection time are propagated to the outlinks 
of those URLs. This means that when you index your URLs, the meta tags you 
specified for them are indexed alongside those URLs and can be queried 
directly.
  
  {{{
- package org.apache.nutch.parse.recommended;
+ package org.apache.nutch.indexer.urlmeta;
  
- // JDK imports
- import java.util.Enumeration;
- import java.util.Properties;
- import java.util.logging.Logger;
- 
- // Nutch imports
- import org.apache.nutch.parse.HTMLMetaTags;
+ import org.apache.commons.logging.Log;
+ import org.apache.commons.logging.LogFactory;
+ import org.apache.hadoop.conf.Configuration;
+ import org.apache.hadoop.io.Text;
+ import org.apache.nutch.crawl.CrawlDatum;
+ import org.apache.nutch.crawl.Inlinks;
+ import org.apache.nutch.indexer.IndexingException;
+ import org.apache.nutch.indexer.IndexingFilter;
+ import org.apache.nutch.indexer.NutchDocument;
  import org.apache.nutch.parse.Parse;
- import org.apache.nutch.parse.HtmlParseFilter;
- import org.apache.nutch.protocol.Content;
- import org.apache.nutch.util.LogFormatter;
  
- public class RecommendedParser implements HtmlParseFilter {
+ public class URLMetaIndexingFilter implements IndexingFilter {
  
-   private static final Logger LOG = LogFormatter
+       private static final Log LOG = LogFactory
-     .getLogger(RecommendedParser.class.getName());
+                       .getLog(URLMetaIndexingFilter.class);
+       private static final String CONF_PROPERTY = "urlmeta.tags";
+       private static String[] urlMetaTags;
+       private Configuration conf;
  
-   /** The Recommended meta data attribute name */
-   public static final String META_RECOMMENDED_NAME="Recommended";
+       /**
+        * Takes the metatags listed in your "urlmeta.tags" property and looks
+        * for each of them inside the CrawlDatum object. Every metatag that is
+        * found is added as an attribute inside the NutchDocument.
+        * 
+        * @see IndexingFilter#filter
+        */
+       public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+                       CrawlDatum datum, Inlinks inlinks) throws 
IndexingException {
+               if (conf != null)
+                       this.setConf(conf);
  
+               if (urlMetaTags == null || doc == null)
+                       return doc;
-   /**
-    * Scan the HTML document looking for a recommended meta tag.
-    */
-   public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, 
DocumentFragment doc) {
-     // Trying to find the document's recommended term
-         String recommendation = null;
  
-         Properties generalMetaTags = metaTags.getGeneralTags();
+               for (String metatag : urlMetaTags) {
+                       Text metadata = (Text) datum.getMetaData().get(new 
Text(metatag));
  
+                       if (metadata != null)
+                               doc.add(metatag, metadata.toString());
+               }
-         for (Enumeration tagNames = generalMetaTags.propertyNames(); 
tagNames.hasMoreElements(); ) {
-                         if (tagNames.nextElement().equals("recommended")) {
-                                 recommendation = 
generalMetaTags.getProperty("recommended");
-                                 LOG.info("Found a Recommendation for " + 
recommendation);
-                         }
-         }
  
+               return doc;
+       }
-         if (recommendation == null) {
-                         LOG.info("No Recommendataion");
-         } else {
-                         LOG.info("Adding Recommendation for " + 
recommendation);
-                 parse.getData().getMetadata().put(META_RECOMMENDED_NAME, 
recommendation);
-         }
  
-     return parse;
-   }
+       /** Boilerplate */
+       public Configuration getConf() {
+               return conf;
+       }
+ 
+       /**
+        * Stores the configuration and, when it is non-null, reads the list of
+        * metatag names from the "urlmeta.tags" property.
+        */
+       public void setConf(Configuration conf) {
+               this.conf = conf;
+ 
+               if (conf == null)
+                       return;
+ 
+               urlMetaTags = conf.getStrings(CONF_PROPERTY);
+       }
  }
  }}}
+ 
- == The Indexer Extension ==
+ == The Scoring Extension ==
  The following is the code for the Indexing Filter extension.  If the document 
being indexed had a recommended meta tag this extension adds a lucene text 
field to the index called "recommended" with the content of that meta tag.
  
  {{{

Reply via email to