Author: jerome
Date: Fri Mar  3 14:33:29 2006
New Revision: 382948

URL: http://svn.apache.org/viewcvs?rev=382948&view=rev
Log:
Add a microformats rel-tag parser/indexer/searcher plugin (a la technorati)

Added:
    lucene/nutch/trunk/src/plugin/microformats-reltag/
    lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml   (with props)
    lucene/nutch/trunk/src/plugin/microformats-reltag/plugin.xml   (with props)
    lucene/nutch/trunk/src/plugin/microformats-reltag/src/
    lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/
    lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/
    lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/
    lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/
    
    lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/
    lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/
    lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java   (with props)
    lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java   (with props)
    lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java   (with props)
    lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html   (with props)
Modified:
    lucene/nutch/trunk/build.xml
    lucene/nutch/trunk/default.properties
    lucene/nutch/trunk/src/plugin/build.xml

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=382948&r1=382947&r2=382948&view=diff
==============================================================================
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Fri Mar  3 14:33:29 2006
@@ -249,6 +249,7 @@
        <packageset dir="${src.dir}"/>
        <packageset dir="${plugins.dir}/lib-http/src/java"/>
        <packageset dir="${plugins.dir}/lib-parsems/src/java"/>
+       <packageset dir="${plugins.dir}/microformats-reltag/src/java"/>
        <packageset dir="${plugins.dir}/ontology/src/java"/>
        <packageset dir="${plugins.dir}/protocol-file/src/java"/>
        <packageset dir="${plugins.dir}/protocol-ftp/src/java"/>

Modified: lucene/nutch/trunk/default.properties
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/default.properties?rev=382948&r1=382947&r2=382948&view=diff
==============================================================================
--- lucene/nutch/trunk/default.properties (original)
+++ lucene/nutch/trunk/default.properties Fri Mar  3 14:33:29 2006
@@ -70,6 +70,7 @@
 # plugin.ontology=org.apache.nutch.ontology*
 plugin.parsems=org.apache.nutch.parse.ms*
 plugin.pdf=org.apache.nutch.parse.pdf*
+plugin.reltag=org.apache.nutch.microformats.reltag*
 plugin.rss=org.apache.nutch.parse.rss*
 plugin.rtf=org.apache.nutch.parse.rtf*
 plugin.site=org.apache.nutch.searcher.site*
@@ -98,6 +99,7 @@
    ${plugin.msword}:\
    ${plugin.parsems}:\
    ${plugin.pdf}:\
+   ${plugin.reltag}:\
    ${plugin.rss}:\
    ${plugin.rtf}:\
    ${plugin.site}:\

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=382948&r1=382947&r2=382948&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Fri Mar  3 14:33:29 2006
@@ -18,6 +18,7 @@
      <ant dir="lib-lucene-analyzers" target="deploy"/>
      <ant dir="lib-nekohtml" target="deploy"/>
      <ant dir="lib-parsems" target="deploy"/>
+     <ant dir="microformats-reltag" target="deploy"/>
      <ant dir="nutch-extensionpoints" target="deploy"/>
      <ant dir="ontology" target="deploy"/>
      <ant dir="protocol-file" target="deploy"/>
@@ -86,6 +87,7 @@
     <ant dir="lib-lucene-analyzers" target="clean"/>
     <ant dir="lib-nekohtml" target="clean"/>
     <ant dir="lib-parsems" target="clean"/>
+    <ant dir="microformats-reltag" target="clean"/>
     <ant dir="nutch-extensionpoints" target="clean"/>
     <ant dir="ontology" target="clean"/>
     <ant dir="protocol-file" target="clean"/>

Added: lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml?rev=382948&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml Fri Mar  3 14:33:29 2006
@@ -0,0 +1,17 @@
+<?xml version="1.0"?>
+
+<project name="microformats-reltag" default="jar">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="compile-core" inheritall="false" dir="${nutch.root}"/>
+  </target>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+  </target>
+
+</project>

Propchange: lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/microformats-reltag/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/microformats-reltag/plugin.xml?rev=382948&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/microformats-reltag/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/microformats-reltag/plugin.xml Fri Mar  3 14:33:29 2006
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="microformats-reltag"
+   name="Rel-Tag microformat Parser/Indexer/Querier"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+    <runtime>
+      <library name="microformats-reltag.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.microformats.reltag.RelTagParser"
+              name="Rel-Tag parser"
+              point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="RelTagParser"
+                      class="org.apache.nutch.microformats.reltag.RelTagParser"/>
+   </extension>
+
+   <extension id="org.apache.nutch.microformats.reltag.RelTagIndexingFilter"
+              name="Rel-Tag indexing filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="RelTagIndexingFilter"
+                      class="org.apache.nutch.microformats.reltag.RelTagIndexingFilter"/>
+   </extension>
+
+
+   <extension id="org.apache.nutch.microformats.reltag.RelTagQueryFilter"
+              name="Rel-Tag query filter"
+              point="org.apache.nutch.searcher.QueryFilter">
+      <implementation id="RelTagQueryFilter"
+                      class="org.apache.nutch.microformats.reltag.RelTagQueryFilter"
+                      raw-fields="tag"/>
+   </extension>
+
+
+</plugin>
+

Propchange: lucene/nutch/trunk/src/plugin/microformats-reltag/plugin.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java?rev=382948&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java Fri Mar  3 14:33:29 2006
@@ -0,0 +1,82 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.microformats.reltag;
+
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.hadoop.io.UTF8;
+import org.apache.nutch.parse.Parse;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Lucene imports
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Document;
+
+
+/**
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that 
+ * add <code>tag</code> field(s) to the document.
+ *
+ * @see <a href="http://www.microformats.org/wiki/rel-tag">
+ *      http://www.microformats.org/wiki/rel-tag</a>
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class RelTagIndexingFilter implements IndexingFilter {
+  
+
+  private Configuration conf;
+
+
+  // Inherited JavaDoc
+  public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
+    throws IndexingException {
+
+    // Check if some Rel-Tags found, possibly put there by RelTagParser
+    String[] tags = parse.getData().getParseMeta().getValues(RelTagParser.REL_TAG);
+    if (tags != null) {
+      for (int i=0; i<tags.length; i++) {
+        doc.add(new Field("tag", tags[i],
+                          Field.Store.YES, Field.Index.UN_TOKENIZED));
+      }
+    }
+
+    return doc;
+  }
+
+  
+  /* ----------------------------- *
+   * <implementation:Configurable> *
+   * ----------------------------- */
+  
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+  
+  /* ------------------------------ *
+   * </implementation:Configurable> *
+   * ------------------------------ */
+  
+}

Propchange: 
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java?rev=382948&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java (added)
+++ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java Fri Mar  3 14:33:29 2006
@@ -0,0 +1,153 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.microformats.reltag;
+
+// JDK imports
+import java.net.URL;
+import java.net.URLDecoder;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.logging.Logger;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+// Nutch imports
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.StringUtil;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+
+
+/**
+ * Adds microformat rel-tags of document if found.
+ *
+ * @see <a href="http://www.microformats.org/wiki/rel-tag">
+ *      http://www.microformats.org/wiki/rel-tag</a>
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class RelTagParser implements HtmlParseFilter {
+  
+  public final static Logger LOG =
+          LogFormatter.getLogger(RelTagParser.class.getName());
+
+  public final static String REL_TAG = "Rel-Tag";
+  
+  
+  private Configuration conf = null;
+  
+  
+  /**
+   * Scan the HTML document looking at possible rel-tags
+   */
+  public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
+    
+    // Trying to find the document's rel-tags
+    Parser parser = new Parser(doc);
+    Set tags = parser.getRelTags();
+    Iterator iter = tags.iterator();
+    Metadata metadata = parse.getData().getParseMeta();
+    while (iter.hasNext()) {
+      metadata.add(REL_TAG, (String) iter.next());
+    }
+    return parse;
+  }
+
+  private static class Parser {
+
+    Set tags = null;
+    
+    Parser(Node node) {
+      tags = new TreeSet();
+      parse(node);
+    }
+  
+    Set getRelTags() {
+      return tags;
+    }
+    
+    void parse(Node node) {
+
+      if (node.getNodeType() == Node.ELEMENT_NODE) {
+        // Look for <a> tag
+        if ("a".equalsIgnoreCase(node.getNodeName())) {
+          NamedNodeMap attrs = node.getAttributes();
+          Node hrefNode = attrs.getNamedItem("href");
+          // Checks that it contains a href attribute
+          if (hrefNode != null) {
+            Node relNode = attrs.getNamedItem("rel");
+            // Checks that it contains a rel attribute too
+            if (relNode != null) {
+              // Finaly checks that rel=tag
+              if ("tag".equalsIgnoreCase(relNode.getNodeValue())) {
+                String tag = parseTag(hrefNode.getNodeValue());
+                if (!StringUtil.isEmpty(tag)) {
+                  tags.add(tag);
+                }
+              }
+            }
+          }
+        }
+      }
+      
+      // Recurse
+      NodeList children = node.getChildNodes();
+      for (int i=0; children != null && i<children.getLength(); i++) {
+        parse(children.item(i));
+      }
+    }
+    
+    private final static String parseTag(String url) {
+      String tag = null;
+      try {
+        URL u = new URL(url);
+        String path = u.getPath();
+        tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1), "UTF-8");
+      } catch (Exception e) {
+        // Malformed tag...
+        tag = null;
+      }
+      return tag;
+    }
+    
+  }
+
+
+  /* ----------------------------- *
+   * <implementation:Configurable> *
+   * ----------------------------- */
+  
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+  
+  /* ------------------------------ *
+   * </implementation:Configurable> *
+   * ------------------------------ */
+  
+}

Propchange: 
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java?rev=382948&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java Fri Mar  3 14:33:29 2006
@@ -0,0 +1,57 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.microformats.reltag;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.searcher.RawFieldQueryFilter;
+
+
+/**
+ * Handles <code>"tag:"<code> query clauses.
+ * 
+ * @see <a href="http://www.microformats.org/wiki/rel-tag">
+ *      http://www.microformats.org/wiki/rel-tag</a>
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class RelTagQueryFilter extends RawFieldQueryFilter {
+  
+  private Configuration conf;
+
+  public RelTagQueryFilter() {
+    super("tag", true, 1.0f);
+  }
+  
+  
+  /* ----------------------------- *
+   * <implementation:Configurable> *
+   * ----------------------------- */
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /* ------------------------------ *
+   * </implementation:Configurable> *
+   * ------------------------------ */
+
+}

Propchange: 
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html?rev=382948&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html (added)
+++ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html Fri Mar  3 14:33:29 2006
@@ -0,0 +1,8 @@
+<html>
+<body>
+<p>
+A microformats <a href="http://www.microformats.org/wiki/Rel-Tag">Rel-Tag</a>
+Parser/Indexer/Querier plugin.
+</p>
+</body>
+</html>

Propchange: 
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to