Hi, All,
Attached is a patch to index and display a few pieces of meta information:
type, size, and date of last modification,
if they are available.
The plugin has an unimaginative name: index-more, reflecting
its experimental nature.
Next I am going to make this metadata searchable.
I intend to commit it in two days, if there is no objection.
John
--------------------------- patch.txt.20041001 ------------------------------
diff -Nur --exclude='*conf' ./nutch-cvs-20040929/build.xml
nutch-cvs-20040929.xing/build.xml
--- ./nutch-cvs-20040929/build.xml 2004-09-28 22:22:15.000000000 -0700
+++ nutch-cvs-20040929.xing/build.xml 2004-09-29 21:48:14.000000000 -0700
@@ -203,6 +203,7 @@
<packageset dir="${plugins.dir}/parse-mp3/src/java"/>
<packageset dir="${plugins.dir}/parse-msword/src/java"/>
<packageset dir="${plugins.dir}/index-basic/src/java"/>
+ <packageset dir="${plugins.dir}/index-more/src/java"/>
<packageset dir="${plugins.dir}/creativecommons/src/java"/>
<packageset dir="${plugins.dir}/languageidentifier/src/java"/>
<packageset dir="${plugins.dir}/clustering-carrot2/src/java"/>
diff -Nur --exclude='*conf' ./nutch-cvs-20040929/default.properties
nutch-cvs-20040929.xing/default.properties
--- ./nutch-cvs-20040929/default.properties 2004-09-28 22:22:15.000000000 -0700
+++ nutch-cvs-20040929.xing/default.properties 2004-09-29 21:50:55.000000000 -0700
@@ -49,8 +49,9 @@
plugin.pdf=net.nutch.parse.pdf*
plugin.text=net.nutch.parse.text*
plugin.basic=net.nutch.indexer.basic*
+plugin.more=net.nutch.indexer.more*
plugin.language=net.nutch.analysis.lang*
plugin.creative=org.creativecommons.nutch*
plugins.packages=${plugin.http}:${plugin.ftp}:${plugin.file}:${plugin.html}:${plugin.mp3}:\
- ${plugin.msword}:${plugin.rtf}:${plugin.pdf}:${plugin.text}:${plugin.basic}:\
+ ${plugin.msword}:${plugin.rtf}:${plugin.pdf}:${plugin.text}:${plugin.basic}:${plugin.more}:\
${plugin.language}:${plugin.creative}
diff -Nur --exclude='*conf' ./nutch-cvs-20040929/src/plugin/build.xml
nutch-cvs-20040929.xing/src/plugin/build.xml
--- ./nutch-cvs-20040929/src/plugin/build.xml 2004-09-28 22:22:15.000000000 -0700
+++ nutch-cvs-20040929.xing/src/plugin/build.xml 2004-09-29 21:47:51.000000000
-0700
@@ -17,6 +17,7 @@
<ant dir="parse-rtf" target="deploy"/>
<ant dir="parse-ext" target="deploy"/>
<ant dir="index-basic" target="deploy"/>
+ <ant dir="index-more" target="deploy"/>
<ant dir="query-basic" target="deploy"/>
<ant dir="query-site" target="deploy"/>
<ant dir="query-url" target="deploy"/>
@@ -55,6 +56,7 @@
<ant dir="parse-rtf" target="clean"/>
<ant dir="parse-ext" target="clean"/>
<ant dir="index-basic" target="clean"/>
+ <ant dir="index-more" target="clean"/>
<ant dir="query-basic" target="clean"/>
<ant dir="query-site" target="clean"/>
<ant dir="query-url" target="clean"/>
diff -Nur --exclude='*conf' ./nutch-cvs-20040929/src/plugin/index-more/build.xml
nutch-cvs-20040929.xing/src/plugin/index-more/build.xml
--- ./nutch-cvs-20040929/src/plugin/index-more/build.xml 1969-12-31
16:00:00.000000000 -0800
+++ nutch-cvs-20040929.xing/src/plugin/index-more/build.xml 2004-09-29
21:24:02.000000000 -0700
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+
+<project name="index-more" default="jar">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
diff -Nur --exclude='*conf' ./nutch-cvs-20040929/src/plugin/index-more/plugin.xml
nutch-cvs-20040929.xing/src/plugin/index-more/plugin.xml
--- ./nutch-cvs-20040929/src/plugin/index-more/plugin.xml 1969-12-31
16:00:00.000000000 -0800
+++ nutch-cvs-20040929.xing/src/plugin/index-more/plugin.xml 2004-09-29
21:30:20.000000000 -0700
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+ id="index-more"
+ name="More Indexing Filter"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <extension-point
+ id="net.nutch.indexer.IndexingFilter"
+ name="Nutch Indexing Filter"/>
+
+ <!--
+ <extension-point
+ id="net.nutch.searcher.QueryFilter"
+ name="Nutch Query Filter"/>
+ -->
+
+ <runtime>
+ <library name="index-more.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <extension id="net.nutch.indexer.more"
+ name="Nutch More Indexing Filter"
+ point="net.nutch.indexer.IndexingFilter">
+ <implementation id="MoreIndexingFilter"
+ class="net.nutch.indexer.more.MoreIndexingFilter"/>
+ </extension>
+
+ <!--
+ <extension id="net.nutch.searcher.site.SiteQueryFilter"
+ name="Nutch More Indexing Filter"
+ point="net.nutch.searcher.QueryFilter">
+ <implementation id="SiteQueryFilter"
+ class="net.nutch.searcher.site.SiteQueryFilter"
+ raw-fields="site"/>
+ </extension>
+ -->
+
+</plugin>
diff -Nur --exclude='*conf'
./nutch-cvs-20040929/src/plugin/index-more/src/java/net/nutch/indexer/more/MoreIndexingFilter.java
nutch-cvs-20040929.xing/src/plugin/index-more/src/java/net/nutch/indexer/more/MoreIndexingFilter.java
---
./nutch-cvs-20040929/src/plugin/index-more/src/java/net/nutch/indexer/more/MoreIndexingFilter.java
1969-12-31 16:00:00.000000000 -0800
+++
nutch-cvs-20040929.xing/src/plugin/index-more/src/java/net/nutch/indexer/more/MoreIndexingFilter.java
2004-10-01 18:14:29.000000000 -0700
@@ -0,0 +1,212 @@
+/* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
+/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
+
+package net.nutch.indexer.more;
+
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
+import org.apache.oro.text.regex.Perl5Pattern;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.MalformedPatternException;
+
+import javax.activation.MimetypesFileTypeMap;
+import javax.activation.MimeType;
+import javax.activation.MimeTypeParseException;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+import net.nutch.net.protocols.HttpDateFormat;
+import java.text.ParseException;
+
+import net.nutch.parse.Parse;
+
+import net.nutch.indexer.IndexingFilter;
+import net.nutch.indexer.IndexingException;
+
+import net.nutch.fetcher.FetcherOutput;
+
+import net.nutch.util.NutchConf;
+
+import net.nutch.util.LogFormatter;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import java.util.Enumeration;
+import java.util.Locale;
+import java.util.Properties;
+
+import java.io.InputStream;
+import java.io.IOException;
+
+//import java.net.URL;
+//import java.net.MalformedURLException;
+
+/************************************
+ * Add a few metaData properties to respective fields,
+ * so that they can be displayed by more.jsp (called by search.jsp).
+ * In future, need to make some of them searchable!
+ *
+ * @author John Xing
+ ***********************************/
+
+public class MoreIndexingFilter implements IndexingFilter {
+  public static final Logger LOG
+    = LogFormatter.getLogger(MoreIndexingFilter.class.getName());
+
+  // file name extension to mime-type map; stays null when the resource named
+  // by "mime.types.file" is missing, and filter() then skips content-type.
+  static MimetypesFileTypeMap TYPE_MAP = null;
+  static {
+    try {
+      InputStream is = null;
+      try {
+        // read mime types from config file
+        is = NutchConf.getConfResourceAsInputStream
+          (NutchConf.get("mime.types.file"));
+        if (is == null) {
+          LOG.warning
+            ("no mime.types.file: content-type won't be indexed.");
+        } else {
+          TYPE_MAP = new MimetypesFileTypeMap(is);
+        }
+      } finally {
+        // release the stream even when building the map fails
+        if (is != null)
+          is.close();
+      }
+    } catch (IOException e) {
+      LOG.log(Level.SEVERE, "Unexpected error", e);
+    }
+  }
+
+  // patterns used to extract filename from possible non-standard
+  // HTTP header "Content-Disposition" (see code below).
+  // Typically it looks like:
+  // Content-Disposition: inline; filename="foo.ppt"
+  private PatternMatcher matcher = new Perl5Matcher();
+  static Perl5Pattern patterns[] = {null, null};
+  static {
+    Perl5Compiler compiler = new Perl5Compiler();
+    try {
+      patterns[0] =
+        (Perl5Pattern) compiler.compile("\\bfilename=['\"](.+)['\"]");
+      patterns[1] =
+        (Perl5Pattern) compiler.compile("\\bfilename=(\\S+)\\b");
+    } catch (MalformedPatternException e) {
+      // report instead of ignoring; filter() skips entries left null
+      LOG.log(Level.SEVERE, "Can't compile Content-Disposition pattern", e);
+    }
+  }
+
+  /**
+   * Adds HTTP metadata of a page to its document as stored, un-indexed
+   * fields: lastModified, contentLength, primaryType and subType. A file
+   * name found in a Content-Disposition header is added as a title field.
+   * Metadata keys are matched case-insensitively.
+   *
+   * @param doc document to add the fields to
+   * @param parse parse result whose metadata is read
+   * @param fo fetcher output of the page; not used by this filter
+   * @return doc, with whatever fields could be derived added to it
+   * @throws IndexingException declared for the interface; not raised here
+   */
+  public Document filter(Document doc, Parse parse, FetcherOutput fo)
+    throws IndexingException {
+
+    // normalize all keys to lower case (see normalizeKeys() method below).
+    Properties metaData = normalizeKeys(parse.getData().getMetadata());
+
+    // store Last-Modified
+    String lastModified = metaData.getProperty("last-modified");
+    if (lastModified != null) {
+      HttpDateFormat format = new HttpDateFormat();
+      try {
+        lastModified = new Long(format.toLong(lastModified)).toString();
+      } catch (ParseException e) {
+        LOG.warning("Can't parse erroneous last-modified: "+lastModified);
+        lastModified = null;
+      }
+    }
+    if (lastModified != null)
+      doc.add(Field.UnIndexed("lastModified", lastModified));
+
+    // store Content-Length only when it is a plain number: the header comes
+    // from the remote server and more.jsp prints the stored value verbatim.
+    String contentLength = metaData.getProperty("content-length");
+    if (contentLength != null) {
+      contentLength = contentLength.trim();
+      try {
+        Long.parseLong(contentLength);
+        doc.add(Field.UnIndexed("contentLength", contentLength));
+      } catch (NumberFormatException e) {
+        LOG.warning("Can't parse erroneous content-length: "+contentLength);
+      }
+    }
+
+    // index Content-Type
+    String contentType = metaData.getProperty("content-type");
+    if (TYPE_MAP == null || contentType == null)
+      return doc;
+
+    MimeType mimeType;
+    try {
+      mimeType = new MimeType(contentType);
+    } catch (MimeTypeParseException e) {
+      LOG.warning("Can't parse erroneous content-type: "+contentType);
+      return doc;
+    }
+
+    String primaryType = mimeType.getPrimaryType();
+    String subType = mimeType.getSubType();
+    // leave this for future improvement
+    //MimeTypeParameterList parameterList = mimeType.getParameters()
+
+    // primaryType and subType are stored
+    doc.add(Field.UnIndexed("primaryType", primaryType));
+    doc.add(Field.UnIndexed("subType", subType));
+
+    // reset title if we see non-standard HTTP header "Content-Disposition"
+    // it's a good indication that content provider wants filename therein
+    // be used as the title of this url.
+    String contentDisposition= metaData.getProperty("content-disposition");
+    if (contentDisposition == null)
+      return doc; // header absent: nothing to match
+
+    MatchResult result;
+    for (int i=0; i<patterns.length; i++) {
+      if (patterns[i] == null)
+        continue; // this pattern failed to compile at class load
+      if (matcher.contains(contentDisposition,patterns[i])) {
+        result = matcher.getMatch();
+        doc.add(Field.UnIndexed("title", result.group(1)));
+        break;
+      }
+    }
+
+    // return the modified document
+    return doc;
+  }
+
+  // keys in nutch metaData are saved as case-sensitive.
+  // However some http server implementations do not 'properly'
+  // case http headers. To deal with such 'anomalies',
+  // we normalize all keys to lower case only when necessary.
+  // But, this should NOT be done when metaData is saved, because
+  // there is a benefit to preserve whatever comes from server.
+  private Properties normalizeKeys(Properties old) {
+    Properties normalized = new Properties();
+
+    for (Enumeration e = old.propertyNames(); e.hasMoreElements();) {
+      String key = (String) e.nextElement();
+      String value = old.getProperty(key);
+      // fixed locale so the result does not vary with the JVM default
+      // (e.g. 'I' lower-cases to a dotless i under a Turkish locale)
+      normalized.setProperty(key.toLowerCase(Locale.ENGLISH),value);
+    }
+
+    return normalized;
+  }
+
+}
diff -Nur --exclude='*conf'
./nutch-cvs-20040929/src/plugin/index-more/src/java/net/nutch/indexer/more/package.html
nutch-cvs-20040929.xing/src/plugin/index-more/src/java/net/nutch/indexer/more/package.html
---
./nutch-cvs-20040929/src/plugin/index-more/src/java/net/nutch/indexer/more/package.html
1969-12-31 16:00:00.000000000 -0800
+++
nutch-cvs-20040929.xing/src/plugin/index-more/src/java/net/nutch/indexer/more/package.html
2004-09-29 21:51:51.000000000 -0700
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>A more indexing plugin.</p><p></p>
+</body>
+</html>
diff -Nur --exclude='*conf' ./nutch-cvs-20040929/src/web/jsp/more.jsp
nutch-cvs-20040929.xing/src/web/jsp/more.jsp
--- ./nutch-cvs-20040929/src/web/jsp/more.jsp 1969-12-31 16:00:00.000000000 -0800
+++ nutch-cvs-20040929.xing/src/web/jsp/more.jsp 2004-10-01 17:33:22.000000000
-0700
@@ -0,0 +1,56 @@
+<%
+  // @author John Xing
+  // show meta info (currently type, size, date of last-modified)
+  // for each hit. These info are indexed by ./src/plugin/index-more.
+  // The stored values come from remote servers' HTTP headers, so text is
+  // passed through Entities.encode (as search.jsp does for the title).
+
+  // do not show unless we have something
+  boolean showMore = false;
+
+  // Content-Type
+  String primaryType = detail.getValue("primaryType");
+  String subType = detail.getValue("subType");
+
+  String contentType = subType;
+  if (contentType == null)
+    contentType = primaryType;
+  if (contentType != null) {
+    contentType = "[<font color=#0000ff>"
+      + Entities.encode(contentType) + "</font>]";
+    showMore = true;
+  } else {
+    contentType = "";
+  }
+
+  // Content-Length
+  String contentLength = detail.getValue("contentLength");
+  if (contentLength != null) {
+    contentLength = "(" + Entities.encode(contentLength) + " bytes)";
+    showMore = true;
+  } else {
+    contentLength = "";
+  }
+
+  // Last-Modified
+  String lastModified = detail.getValue("lastModified");
+  if (lastModified != null) {
+    try {
+      Calendar cal = new GregorianCalendar();
+      cal.setTimeInMillis(Long.parseLong(lastModified));
+      lastModified = cal.get(Calendar.YEAR)
+        + "." + (1+cal.get(Calendar.MONTH)) // it is 0-based
+        + "." + cal.get(Calendar.DAY_OF_MONTH);
+      showMore = true;
+    } catch (NumberFormatException e) {
+      // stored value is not a number: omit the date, keep the page working
+      lastModified = "";
+    }
+  } else {
+    lastModified = "";
+  }
+%>
+
+<% if (showMore) { %>
+  <br><font size=-1><nobr><%=contentType%> <%=contentLength%> <%=lastModified%></nobr></font>
+<% } %>
diff -Nur --exclude='*conf' ./nutch-cvs-20040929/src/web/jsp/search.jsp
nutch-cvs-20040929.xing/src/web/jsp/search.jsp
--- ./nutch-cvs-20040929/src/web/jsp/search.jsp 2004-09-10 00:03:05.000000000 -0700
+++ nutch-cvs-20040929.xing/src/web/jsp/search.jsp 2004-09-30 17:54:21.000000000
-0700
@@ -180,9 +180,8 @@
if (title == null || title.equals("")) // use url for docs w/o title
title = url;
%>
- <b>
- <a href="<%=url%>"><%=Entities.encode(title)%></a>
- </b>
+ <b><a href="<%=url%>"><%=Entities.encode(title)%></a></b>
+ <%@ include file="./more.jsp" %>
<% if (!"".equals(summary)) { %>
<br><%=summary%>
<% } %>
-------------------------------------------------------
This SF.net email is sponsored by: IT Product Guide on ITManagersJournal
Use IT products in your business? Tell us what you think of them. Give us
Your Opinions, Get Free ThinkGeek Gift Certificates! Click to find out more
http://productguide.itmanagersjournal.com/guidepromo.tmpl
_______________________________________________
Nutch-developers mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-developers