[Nutch-dev] patch to index and display a few meta info

john Fri, 01 Oct 2004 18:35:05 -0700

Hi, All,

Attached is a patch to index and display some meta information:
type, size and last-modified date,
if they are available.


The plugin has an unimaginative name: index-more, reflecting
its experimental nature.

Next I am going to make these meta data searchable.

I intend to commit it in two days, if there is no objection.

John

--------------------------- patch.txt.20041001 ------------------------------

diff -Nur --exclude='*conf' ./nutch-cvs-20040929/build.xml 
nutch-cvs-20040929.xing/build.xml
--- ./nutch-cvs-20040929/build.xml      2004-09-28 22:22:15.000000000 -0700
+++ nutch-cvs-20040929.xing/build.xml   2004-09-29 21:48:14.000000000 -0700
@@ -203,6 +203,7 @@
        <packageset dir="${plugins.dir}/parse-mp3/src/java"/>
        <packageset dir="${plugins.dir}/parse-msword/src/java"/>
        <packageset dir="${plugins.dir}/index-basic/src/java"/>
+       <packageset dir="${plugins.dir}/index-more/src/java"/>
        <packageset dir="${plugins.dir}/creativecommons/src/java"/>
        <packageset dir="${plugins.dir}/languageidentifier/src/java"/>
        <packageset dir="${plugins.dir}/clustering-carrot2/src/java"/>
diff -Nur --exclude='*conf' ./nutch-cvs-20040929/default.properties 
nutch-cvs-20040929.xing/default.properties
--- ./nutch-cvs-20040929/default.properties     2004-09-28 22:22:15.000000000 -0700
+++ nutch-cvs-20040929.xing/default.properties  2004-09-29 21:50:55.000000000 -0700
@@ -49,8 +49,9 @@
 plugin.pdf=net.nutch.parse.pdf*
 plugin.text=net.nutch.parse.text*
 plugin.basic=net.nutch.indexer.basic*
+plugin.more=net.nutch.indexer.more*
 plugin.language=net.nutch.analysis.lang*
 plugin.creative=org.creativecommons.nutch*
 
plugins.packages=${plugin.http}:${plugin.ftp}:${plugin.file}:${plugin.html}:${plugin.mp3}:\
-       ${plugin.msword}:${plugin.rtf}:${plugin.pdf}:${plugin.text}:${plugin.basic}:\
+       ${plugin.msword}:${plugin.rtf}:${plugin.pdf}:${plugin.text}:${plugin.basic}:${plugin.more}:\
        ${plugin.language}:${plugin.creative}
diff -Nur --exclude='*conf' ./nutch-cvs-20040929/src/plugin/build.xml 
nutch-cvs-20040929.xing/src/plugin/build.xml
--- ./nutch-cvs-20040929/src/plugin/build.xml   2004-09-28 22:22:15.000000000 -0700
+++ nutch-cvs-20040929.xing/src/plugin/build.xml        2004-09-29 21:47:51.000000000 
-0700
@@ -17,6 +17,7 @@
     <ant dir="parse-rtf" target="deploy"/>
     <ant dir="parse-ext" target="deploy"/>
     <ant dir="index-basic" target="deploy"/>
+    <ant dir="index-more" target="deploy"/>
     <ant dir="query-basic" target="deploy"/>
     <ant dir="query-site" target="deploy"/>
     <ant dir="query-url" target="deploy"/>
@@ -55,6 +56,7 @@
     <ant dir="parse-rtf" target="clean"/>
     <ant dir="parse-ext" target="clean"/>
     <ant dir="index-basic" target="clean"/>
+    <ant dir="index-more" target="clean"/>
     <ant dir="query-basic" target="clean"/>
     <ant dir="query-site" target="clean"/>
     <ant dir="query-url" target="clean"/>
diff -Nur --exclude='*conf' ./nutch-cvs-20040929/src/plugin/index-more/build.xml 
nutch-cvs-20040929.xing/src/plugin/index-more/build.xml
--- ./nutch-cvs-20040929/src/plugin/index-more/build.xml        1969-12-31 
16:00:00.000000000 -0800
+++ nutch-cvs-20040929.xing/src/plugin/index-more/build.xml     2004-09-29 
21:24:02.000000000 -0700
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+
+<project name="index-more" default="jar">
+
+  <import file="../build-plugin.xml"/>
+
+</project>
diff -Nur --exclude='*conf' ./nutch-cvs-20040929/src/plugin/index-more/plugin.xml 
nutch-cvs-20040929.xing/src/plugin/index-more/plugin.xml
--- ./nutch-cvs-20040929/src/plugin/index-more/plugin.xml       1969-12-31 
16:00:00.000000000 -0800
+++ nutch-cvs-20040929.xing/src/plugin/index-more/plugin.xml    2004-09-29 
21:30:20.000000000 -0700
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="index-more"
+   name="More Indexing Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <extension-point
+      id="net.nutch.indexer.IndexingFilter"
+      name="Nutch Indexing Filter"/>
+
+   <!--
+   <extension-point
+      id="net.nutch.searcher.QueryFilter"
+      name="Nutch Query Filter"/>
+    -->
+
+   <runtime>
+      <library name="index-more.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <extension id="net.nutch.indexer.more"
+              name="Nutch More Indexing Filter"
+              point="net.nutch.indexer.IndexingFilter">
+      <implementation id="MoreIndexingFilter"
+                      class="net.nutch.indexer.more.MoreIndexingFilter"/>
+   </extension>
+
+   <!--
+   <extension id="net.nutch.searcher.site.SiteQueryFilter"
+              name="Nutch More Indexing Filter"
+              point="net.nutch.searcher.QueryFilter">
+      <implementation id="SiteQueryFilter"
+                      class="net.nutch.searcher.site.SiteQueryFilter"
+                      raw-fields="site"/>
+   </extension>
+   -->
+
+</plugin>
diff -Nur --exclude='*conf' 
./nutch-cvs-20040929/src/plugin/index-more/src/java/net/nutch/indexer/more/MoreIndexingFilter.java
 
nutch-cvs-20040929.xing/src/plugin/index-more/src/java/net/nutch/indexer/more/MoreIndexingFilter.java
--- 
./nutch-cvs-20040929/src/plugin/index-more/src/java/net/nutch/indexer/more/MoreIndexingFilter.java
  1969-12-31 16:00:00.000000000 -0800
+++ 
nutch-cvs-20040929.xing/src/plugin/index-more/src/java/net/nutch/indexer/more/MoreIndexingFilter.java
       2004-10-01 18:14:29.000000000 -0700
@@ -0,0 +1,182 @@
+/* Copyright (c) 2004 The Nutch Organization.  All rights reserved.   */
+/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
+
+package net.nutch.indexer.more;
+
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
+import org.apache.oro.text.regex.Perl5Pattern;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.MalformedPatternException;
+
+import javax.activation.MimetypesFileTypeMap;
+import javax.activation.MimeType;
+import javax.activation.MimeTypeParseException;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+import net.nutch.net.protocols.HttpDateFormat;
+import java.text.ParseException;
+
+import net.nutch.parse.Parse;
+
+import net.nutch.indexer.IndexingFilter;
+import net.nutch.indexer.IndexingException;
+
+import net.nutch.fetcher.FetcherOutput;
+
+import net.nutch.util.NutchConf;
+
+import net.nutch.util.LogFormatter;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import java.util.Enumeration;
+import java.util.Properties;
+
+import java.io.InputStream;
+import java.io.IOException;
+
+//import java.net.URL;
+//import java.net.MalformedURLException;
+
+/************************************
+ * Add a few metaData properties to respective fields,
+ * so that they can be displayed by more.jsp (called by search.jsp).
+ * In future, need to make some of them searchable!
+ *
+ * @author John Xing
+ ***********************************/
+
+public class MoreIndexingFilter implements IndexingFilter {
+  public static final Logger LOG
+    = LogFormatter.getLogger(MoreIndexingFilter.class.getName());
+
+  // file name extension to mime-type map
+  static MimetypesFileTypeMap TYPE_MAP = null;
+  static {
+    try {
+      // read mime types from config file
+      InputStream is =
+        NutchConf.getConfResourceAsInputStream
+        (NutchConf.get("mime.types.file"));
+      if (is == null) {
+        LOG.warning
+          ("no mime.types.file: content-type won't be indexed.");
+        TYPE_MAP = null;
+      } else {
+        TYPE_MAP = new MimetypesFileTypeMap(is);
+      }
+
+      if (is != null)
+        is.close();
+    } catch (IOException e) {
+      LOG.log(Level.SEVERE, "Unexpected error", e);
+    }
+  }
+
+  // patterns used to extract filename from possible non-standard
+  // HTTP header "Content-Disposition" (see code below).
+  // Typically it looks like:
+  // Content-Disposition: inline; filename="foo.ppt"
+  private PatternMatcher matcher = new Perl5Matcher();
+  static Perl5Pattern patterns[] = {null, null};
+  static {
+    Perl5Compiler compiler = new Perl5Compiler();
+    try {
+      patterns[0] =
+        (Perl5Pattern) compiler.compile("\\bfilename=['\"](.+)['\"]");
+      patterns[1] =
+        (Perl5Pattern) compiler.compile("\\bfilename=(\\S+)\\b");
+    } catch (MalformedPatternException e) {
+      // just ignore
+    }
+  }
+
+  public Document filter(Document doc, Parse parse, FetcherOutput fo)
+    throws IndexingException {
+
+    Properties metaData = parse.getData().getMetadata();
+
+    // normalize all keys to lower case (see normalizeKeys() method below).
+    metaData = normalizeKeys(metaData);
+    
+    // store Last-Modified
+    String lastModified = metaData.getProperty("last-modified");
+    if (lastModified != null) {
+      HttpDateFormat format = new HttpDateFormat();
+      try {
+        lastModified = new Long(format.toLong(lastModified)).toString();
+      } catch  (ParseException e) {
+        LOG.warning("Can't parse erroneous last-modified: "+lastModified);
+        lastModified = null;
+      }
+    }
+    if (lastModified != null)
+      doc.add(Field.UnIndexed("lastModified", lastModified));
+
+    // store Content-Length
+    String contentLength = metaData.getProperty("content-length");
+    if (contentLength != null)
+      doc.add(Field.UnIndexed("contentLength", contentLength));
+
+    // index Content-Type
+    String contentType = metaData.getProperty("content-type");
+    if (TYPE_MAP == null || contentType == null)
+      return doc;
+
+    MimeType mimeType;
+    try {
+      mimeType = new MimeType(contentType);
+    } catch (MimeTypeParseException e) {
+      LOG.warning("Can't parse erroneous content-type: "+contentType);
+      return doc;
+    }
+
+    String primaryType = mimeType.getPrimaryType();
+    String subType = mimeType.getSubType();
+    // leave this for future improvement
+    //MimeTypeParameterList parameterList = mimeType.getParameters()
+
+    // primaryType and subType are stored
+    doc.add(Field.UnIndexed("primaryType", primaryType));
+    doc.add(Field.UnIndexed("subType", subType));
+
+    // reset title if we see non-standard HTTP header "Content-Disposition"
+    // it's a good indication that content provider wants filename therein
+    // be used as the title of this url.
+    String contentDisposition= metaData.getProperty("content-disposition");
+    MatchResult result;
+    for (int i=0; i<patterns.length; i++) {
+      if (matcher.contains(contentDisposition,patterns[i])) {
+        result = matcher.getMatch();
+        doc.add(Field.UnIndexed("title", result.group(1)));
+        break;
+      }
+    }
+
+    // return the modified document
+    return doc;
+  }
+
+  // keys in nutch metaData are saved as case-sensitive.
+  // However some http server implementations do not 'properly'
+  // case http headers. To deal with such 'anomalies',
+  // we normalize all keys to lower case only when necessary.
+  // But, this should NOT be done when metaData is saved, becasue
+  // there is a benefit to preserve whatever comes from server.
+  private Properties normalizeKeys(Properties old) {
+    Properties normalized = new Properties();
+
+    for (Enumeration e = old.propertyNames(); e.hasMoreElements();) {
+      String key = (String) e.nextElement();
+      String value = old.getProperty(key);
+      normalized.setProperty(key.toLowerCase(),value);
+    }
+
+    return normalized;
+  }
+
+}
diff -Nur --exclude='*conf' 
./nutch-cvs-20040929/src/plugin/index-more/src/java/net/nutch/indexer/more/package.html
 
nutch-cvs-20040929.xing/src/plugin/index-more/src/java/net/nutch/indexer/more/package.html
--- 
./nutch-cvs-20040929/src/plugin/index-more/src/java/net/nutch/indexer/more/package.html
     1969-12-31 16:00:00.000000000 -0800
+++ 
nutch-cvs-20040929.xing/src/plugin/index-more/src/java/net/nutch/indexer/more/package.html
  2004-09-29 21:51:51.000000000 -0700
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>A more indexing plugin.</p><p></p>
+</body>
+</html>
diff -Nur --exclude='*conf' ./nutch-cvs-20040929/src/web/jsp/more.jsp 
nutch-cvs-20040929.xing/src/web/jsp/more.jsp
--- ./nutch-cvs-20040929/src/web/jsp/more.jsp   1969-12-31 16:00:00.000000000 -0800
+++ nutch-cvs-20040929.xing/src/web/jsp/more.jsp        2004-10-01 17:33:22.000000000 
-0700
@@ -0,0 +1,48 @@
+<%
+    // @author John Xing
+    // show meta info (currently type, size, date of last-modified)
+    // for each hit. This info is indexed by ./src/plugin/index-more.
+
+    // do not show unless we have something
+    boolean showMore = false;
+
+    // Content-Type
+    String primaryType = detail.getValue("primaryType");
+    String subType = detail.getValue("subType");
+
+    // prefer subType; values come from remote headers, so HTML-encode them
+    String contentType = (subType != null) ? subType : primaryType;
+    if (contentType != null) {
+      contentType = "[<font color=#0000ff>"
+                  + Entities.encode(contentType) + "</font>]";
+      showMore = true;
+    } else {
+      contentType = "";
+    }
+
+    // Content-Length
+    String contentLength = detail.getValue("contentLength");
+    if (contentLength != null) {
+      contentLength = "(" + Entities.encode(contentLength) + " bytes)";
+      showMore = true;
+    } else {
+      contentLength = "";
+    }
+
+    // Last-Modified
+    String lastModified = detail.getValue("lastModified");
+    if (lastModified != null) {
+      Calendar cal = new GregorianCalendar();
+      cal.setTimeInMillis(Long.parseLong(lastModified));
+      lastModified = cal.get(Calendar.YEAR)
+                  + "." + (1+cal.get(Calendar.MONTH)) // it is 0-based
+                  + "." + cal.get(Calendar.DAY_OF_MONTH);
+      showMore = true;
+    } else {
+      lastModified = "";
+    }
+%>
+
+<% if (showMore) { %>
+    <br><font size=-1><nobr><%=contentType%> <%=contentLength%> <%=lastModified%></nobr></font>
+<%  } %>
diff -Nur --exclude='*conf' ./nutch-cvs-20040929/src/web/jsp/search.jsp 
nutch-cvs-20040929.xing/src/web/jsp/search.jsp
--- ./nutch-cvs-20040929/src/web/jsp/search.jsp 2004-09-10 00:03:05.000000000 -0700
+++ nutch-cvs-20040929.xing/src/web/jsp/search.jsp      2004-09-30 17:54:21.000000000 
-0700
@@ -180,9 +180,8 @@
     if (title == null || title.equals(""))        // use url for docs w/o title
       title = url;
     %>
-    <b>
-    <a href="<%=url%>"><%=Entities.encode(title)%></a>
-    </b>
+    <b><a href="<%=url%>"><%=Entities.encode(title)%></a></b>
+    <%@ include file="./more.jsp" %>
     <% if (!"".equals(summary)) { %>
     <br><%=summary%>
     <% } %>


-------------------------------------------------------
This SF.net email is sponsored by: IT Product Guide on ITManagersJournal
Use IT products in your business? Tell us what you think of them. Give us
Your Opinions, Get Free ThinkGeek Gift Certificates! Click to find out more
http://productguide.itmanagersjournal.com/guidepromo.tmpl
_______________________________________________
Nutch-developers mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-developers

[Nutch-dev] patch to index and display a few meta info

Reply via email to