Re: [Nutch-dev] recording Content-Type

john Tue, 02 Mar 2004 10:11:39 -0800

Doug,

Attached is the patch. A contentType field is added to FetcherOutput,
with matching modifications in OutputThread.java, IndexSegment.java
and Fetcher.java (is Fetcher.java still used?). cached.jsp and
search.jsp are tweaked as well. With this patch, nutch will fetch/index
text/plain in addition to text/html, and the content type will be
displayed in search results too.


John

-------------------------- patch.txt.20040302 ----------------------------------
diff -Nur nutch-20040301-cvs.orig/src/java/net/nutch/fetcher/Fetcher.java 
nutch-20040301-cvs/src/java/net/nutch/fetcher/Fetcher.java
--- nutch-20040301-cvs.orig/src/java/net/nutch/fetcher/Fetcher.java     2004-02-13 
11:53:48.000000000 -0800
+++ nutch-20040301-cvs/src/java/net/nutch/fetcher/Fetcher.java  2004-03-01 
14:09:22.000000000 -0800
@@ -125,14 +125,14 @@
       LOG.fine("found " + outlinks.length + " outlinks in " + url);
 
       outputPage(new FetcherOutput(fle, MD5Hash.digest(response.getContent()),
-                                   true, title, outlinks),
+                                   true, title, outlinks, contentType),
                  new FetcherContent(response.getContent()),
                  new FetcherText(text));
     }
 
     private void handleNoFetch(FetchListEntry fle, boolean success) {
       outputPage(new FetcherOutput(fle, 
MD5Hash.digest(fle.getPage().getURL().toString()),
-                                   success, "", new Outlink[0]),
+                                   success, "", new Outlink[0], ""),
                  new FetcherContent(new byte[0]),
                  new FetcherText(""));
     }
diff -Nur nutch-20040301-cvs.orig/src/java/net/nutch/fetcher/FetcherOutput.java 
nutch-20040301-cvs/src/java/net/nutch/fetcher/FetcherOutput.java
--- nutch-20040301-cvs.orig/src/java/net/nutch/fetcher/FetcherOutput.java       
2003-07-01 13:36:07.000000000 -0700
+++ nutch-20040301-cvs/src/java/net/nutch/fetcher/FetcherOutput.java    2004-03-01 
22:26:15.000000000 -0800
@@ -23,7 +23,7 @@
   public static final String DONE_NAME = "fetcher.done";
   public static final String ERROR_NAME = "fetcher.error";
 
-  private final static byte VERSION = 2;
+  private final static byte VERSION = 3;
 
   private FetchListEntry fetchListEntry;
   private MD5Hash md5Hash;
@@ -31,18 +31,20 @@
   private String title = "";
   private Outlink[] outlinks;
   private long fetchDate;
+  private String contentType = "";
 
   public FetcherOutput() {}
 
   public FetcherOutput(FetchListEntry fetchListEntry,
                        MD5Hash md5Hash, boolean success, String title,
-                       Outlink[] outlinks) {
+                       Outlink[] outlinks, String contentType) {
     this.fetchListEntry = fetchListEntry;
     this.md5Hash = md5Hash;
     this.success = success;
     this.title = title != null ? title : "";
     this.outlinks = outlinks;
     this.fetchDate = System.currentTimeMillis();
+    this.contentType = (contentType != null) ? contentType : "";
   }
 
   public byte getVersion() { return VERSION; }
@@ -65,6 +67,8 @@
     }
 
     fetchDate = (version > 1) ? in.readLong() : 0; // added in version=2
+
+    contentType = (version > 2) ? UTF8.readString(in) : ""; //added in version=3
   }
 
   public void write(DataOutput out) throws IOException {
@@ -78,6 +82,7 @@
       outlinks[i].write(out);
     }
     out.writeLong(fetchDate);
+    UTF8.writeString(out, contentType);
   }
 
   public static FetcherOutput read(DataInput in) throws IOException {
@@ -96,6 +101,7 @@
   public Outlink[] getOutlinks() { return outlinks; }
   public long getFetchDate() { return fetchDate; }
   public void setFetchDate(long fetchDate) { this.fetchDate = fetchDate; }
+  public String getContentType() { return contentType; }
 
 
   public boolean equals(Object o) {
@@ -107,7 +113,8 @@
       this.md5Hash.equals(other.md5Hash) &&
       (this.success == other.success) &&
       this.title.equals(other.title) &&
-      Arrays.equals(this.outlinks, other.outlinks);
+      Arrays.equals(this.outlinks, other.outlinks) &&
+      this.contentType.equals(other.contentType);
   }
 
 
@@ -122,6 +129,7 @@
        buffer.append("  outlink: " + outlinks[i] + "\n");
     }
     buffer.append("FetchDate: " + new Date(fetchDate) + "\n" );
+    buffer.append("ContentType: " + contentType + "\n" );
     return buffer.toString();
   }
 
diff -Nur nutch-20040301-cvs.orig/src/java/net/nutch/fetcher/OutputThread.java 
nutch-20040301-cvs/src/java/net/nutch/fetcher/OutputThread.java
--- nutch-20040301-cvs.orig/src/java/net/nutch/fetcher/OutputThread.java        
2004-02-13 11:53:48.000000000 -0800
+++ nutch-20040301-cvs/src/java/net/nutch/fetcher/OutputThread.java     2004-03-01 
23:32:42.000000000 -0800
@@ -125,7 +125,7 @@
       try {
         if ( (outputEntry.getResponse() != null) 
              && (!outputEntry.getHasFailed()) ) {
-          handleFetch(outputEntry.getURL(), outputEntry.getFetchListEntry(),
+          handleFetchAny(outputEntry.getURL(), outputEntry.getFetchListEntry(),
                       outputEntry.getResponse());
         } else {
           boolean succeeded= (!outputEntry.getHasFailed() 
@@ -159,14 +159,37 @@
                                      prevUrlString);
   }
 
-  private void handleFetch(URL url, FetchListEntry fle,
-                           Response response)
-    throws IOException, SAXException, UnhandledContentTypeException,
-    DOMErrorException {
+  // 20031228, xing, dispatcher for various Content-Type
+  private void handleFetchAny(URL url, FetchListEntry fle, Response response)
+    throws IOException, SAXException,
+    UnhandledContentTypeException, DOMErrorException {
+
+    String path = url.getPath();
     String contentType = response.getHeader("Content-Type");
-    if (contentType != null && !contentType.startsWith("text/html"))
+
+    if (contentType.startsWith("text/html")) {
+      try {
+        handleFetchHtml(url, fle, response);
+      } catch (IOException e) {
+        throw e;
+      } catch (SAXException e) {
+        throw e;
+      } catch (DOMErrorException e) {
+        throw e;
+      }
+    } else if (contentType.startsWith("text/plain")) {
+      handleFetchPlain(url, fle, response);
+    } else {
       throw new UnhandledContentTypeException(contentType);
-      
+    }
+
+  }
+
+  private void handleFetchHtml(URL url, FetchListEntry fle,
+                           Response response)
+    throws IOException, SAXException, DOMErrorException {
+    String contentType = response.getHeader("Content-Type");
+
     DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
 
     try {
@@ -223,19 +246,29 @@
     }
 
     outputPage(new FetcherOutput(fle, MD5Hash.digest(response.getContent()),
-                                 true, title, outlinks),
+                                 true, title, outlinks, contentType),
                new FetcherContent(content),
                new FetcherText(text));
   }
 
+  // 20040228, xing, text/plain, save leading part of raw content only
+  private void handleFetchPlain(URL url, FetchListEntry fle, Response response) {
+    String contentType = response.getHeader("Content-Type");
+    byte[] content = response.getContent();
+
+    outputPage(new FetcherOutput(fle, MD5Hash.digest(response.getContent()),
+                                  true, "", new Outlink[0], contentType),
+               new FetcherContent(content),
+               new FetcherText(""));
+  }
+
   private void handleNoFetch(FetchListEntry fle, boolean success) {
     outputPage(new FetcherOutput(fle, 
MD5Hash.digest(fle.getPage().getURL().toString()),
-                                 success, "", new Outlink[0]),
+                                 success, "", new Outlink[0], ""),
                new FetcherContent(new byte[0]),
                new FetcherText(""));
   }
 
-
   private void outputPage(FetcherOutput fo, FetcherContent raw,
                           FetcherText stripped) {
     try {
@@ -248,7 +281,5 @@
       LOG.severe("error writing output:" + t.toString());
     }
   }
-                                       
-                     
-
+ 
 }
diff -Nur nutch-20040301-cvs.orig/src/java/net/nutch/indexer/IndexSegment.java 
nutch-20040301-cvs/src/java/net/nutch/indexer/IndexSegment.java
--- nutch-20040301-cvs.orig/src/java/net/nutch/indexer/IndexSegment.java        
2004-02-01 09:33:48.000000000 -0800
+++ nutch-20040301-cvs/src/java/net/nutch/indexer/IndexSegment.java     2004-03-01 
22:44:22.000000000 -0800
@@ -50,25 +50,29 @@
     writer.setSimilarity(new NutchSimilarity());
 
     ArrayFile.Reader fetcher = null;
+    ArrayFile.Reader content = null;
     ArrayFile.Reader text = null;
 
     int count = 0;
     try {
       fetcher = new ArrayFile.Reader(new File(directory, 
FetcherOutput.DIR_NAME).toString());
+      content = new ArrayFile.Reader(new File(directory, 
FetcherContent.DIR_NAME).toString());
       text = new ArrayFile.Reader(new 
File(directory,FetcherText.DIR_NAME).toString());
 
       String segmentName = directory.getCanonicalFile().getName();
       FetcherOutput fetcherOutput = new FetcherOutput();
+      FetcherContent fetcherContent = new FetcherContent();
       FetcherText fetcherText = new FetcherText();
 
       while (fetcher.next(fetcherOutput) != null && count++ < maxDocs) {
+        content.next(fetcherContent);
         text.next(fetcherText);
         
         if (!fetcherOutput.getSuccess())          // if the fetch failed
           continue;                               // don't index the page
 
         Document doc = makeDocument(segmentName, fetcher.key(),
-                                    fetcherOutput, fetcherText);
+                                    fetcherOutput, fetcherContent, fetcherText);
         writer.addDocument(doc);
       }
     } catch (EOFException e) {
@@ -77,6 +81,8 @@
     } finally {
       if (fetcher != null)
         fetcher.close();
+      if (content != null)
+        content.close();
       if (text != null)
         text.close();
     }
@@ -87,12 +93,14 @@
 
   private Document makeDocument(String segmentName, long docNo,
                                 FetcherOutput fetcherOutput,
+                                FetcherContent fetcherContent,
                                 FetcherText fetcherText)
     throws Exception {
 
     FetchListEntry fle = fetcherOutput.getFetchListEntry();
     String url = fle.getPage().getURL().toString();
     String title = fetcherOutput.getTitle();
+    String contentType = fetcherOutput.getContentType();
 
     if (title.length() > maxTitleLength) {        // truncate title if needed
       title = title.substring(0, maxTitleLength);
@@ -108,9 +116,17 @@
     doc.add(Field.UnIndexed("digest", fetcherOutput.getMD5Hash().toString()));
     doc.add(Field.UnIndexed("docNo", Long.toString(docNo, 16)));
     doc.add(Field.UnIndexed("segment", segmentName));
+    doc.add(Field.UnIndexed("contentType", contentType));
 
     // content is indexed, so that it's searchable, but not stored in index
-    doc.add(Field.UnStored("content", fetcherText.getText()));
+    byte[] raw = fetcherContent.getContent();
+    if (contentType.equals("text/html")) {
+      doc.add(Field.UnStored("content", fetcherText.getText()));
+    } else if (contentType.equals("text/plain")) {
+      doc.add(Field.UnStored("content", new String(raw)));
+    } else {
+      doc.add(Field.UnStored("content", fetcherText.getText()));
+    }
     
     // anchors are indexed, so they're searchable, but not stored in index
     String[] anchors = fle.getAnchors();
diff -Nur nutch-20040301-cvs.orig/src/web/jsp/cached.jsp 
nutch-20040301-cvs/src/web/jsp/cached.jsp
--- nutch-20040301-cvs.orig/src/web/jsp/cached.jsp      2003-09-05 14:01:47.000000000 
-0700
+++ nutch-20040301-cvs/src/web/jsp/cached.jsp   2004-03-02 08:58:57.000000000 -0800
@@ -15,8 +15,17 @@
     ResourceBundle.getBundle("org.nutch.jsp.cached", request.getLocale())
     .getLocale().getLanguage();
 
+  String contentType = details.getValue("contentType");
+
+  byte[] raw = bean.getContent(details);
+  String content = null;
+
   // FIXME: should check that it's html content, check encoding, etc.
-  String content = new String(bean.getContent(details));
+  if (contentType.equals("text/html")) {
+    content = new String(raw);
+  } else if (contentType.equals("text/plain")) {
+    content = "<pre>\n" + (new String(raw)) + "</pre>\n";
+  }
 %><base href="<%=details.getValue("url")%>">
 <%@ taglib uri="http://jakarta.apache.org/taglibs/i18n" prefix="i18n" %>
 <i18n:bundle baseName="org.nutch.jsp.cached"/>
diff -Nur nutch-20040301-cvs.orig/src/web/jsp/search.jsp 
nutch-20040301-cvs/src/web/jsp/search.jsp
--- nutch-20040301-cvs.orig/src/web/jsp/search.jsp      2004-02-03 14:06:45.000000000 
-0800
+++ nutch-20040301-cvs/src/web/jsp/search.jsp   2004-03-01 16:41:21.000000000 -0800
@@ -77,13 +77,20 @@
     HitDetails detail = details[i];
     String title = detail.getValue("title");
     String url = detail.getValue("url");
+    String contentType = detail.getValue("contentType");
     String summary = summaries[i];
     String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo();
 
     if (title == null || title.equals(""))        // use url for docs w/o title
       title = url;
+
+    if (contentType == null || contentType.equals("")) {
+      contentType = "";
+    } else {
+      contentType = "[" + contentType + "]";
+    }
     %>
-    <br><br><b>
+    <br><br><font color=#0000ff><%=contentType%></font><b>
     <a href="<%=url%>"><%=Entities.encode(title)%></a>
     </b>
     <% if (!"".equals(summary)) { %>


-------------------------------------------------------
SF.Net is sponsored by: Speed Start Your Linux Apps Now.
Build and deploy apps & Web services for Linux with
a free DVD software kit from IBM. Click Now!
http://ads.osdn.com/?ad_id=1356&alloc_id=3438&op=click
_______________________________________________
Nutch-developers mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-developers

Re: [Nutch-dev] recording Content-Type

Reply via email to