Doug,
Attached is the patch. A contentType field is introduced in FetcherOutput,
together with modifications to OutputThread.java, IndexSegment.java
and Fetcher.java (is Fetcher.java no longer used?). cached.jsp and
search.jsp are also tweaked. With this patch, Nutch will fetch and index
text/plain in addition to text/html. The content type will be displayed
in search results too.
John
-------------------------- patch.txt.20040302 ----------------------------------
diff -Nur nutch-20040301-cvs.orig/src/java/net/nutch/fetcher/Fetcher.java
nutch-20040301-cvs/src/java/net/nutch/fetcher/Fetcher.java
--- nutch-20040301-cvs.orig/src/java/net/nutch/fetcher/Fetcher.java 2004-02-13
11:53:48.000000000 -0800
+++ nutch-20040301-cvs/src/java/net/nutch/fetcher/Fetcher.java 2004-03-01
14:09:22.000000000 -0800
@@ -125,14 +125,14 @@
LOG.fine("found " + outlinks.length + " outlinks in " + url);
outputPage(new FetcherOutput(fle, MD5Hash.digest(response.getContent()),
- true, title, outlinks),
+ true, title, outlinks, contentType),
new FetcherContent(response.getContent()),
new FetcherText(text));
}
private void handleNoFetch(FetchListEntry fle, boolean success) {
outputPage(new FetcherOutput(fle,
MD5Hash.digest(fle.getPage().getURL().toString()),
- success, "", new Outlink[0]),
+ success, "", new Outlink[0], ""),
new FetcherContent(new byte[0]),
new FetcherText(""));
}
diff -Nur nutch-20040301-cvs.orig/src/java/net/nutch/fetcher/FetcherOutput.java
nutch-20040301-cvs/src/java/net/nutch/fetcher/FetcherOutput.java
--- nutch-20040301-cvs.orig/src/java/net/nutch/fetcher/FetcherOutput.java
2003-07-01 13:36:07.000000000 -0700
+++ nutch-20040301-cvs/src/java/net/nutch/fetcher/FetcherOutput.java 2004-03-01
22:26:15.000000000 -0800
@@ -23,7 +23,7 @@
public static final String DONE_NAME = "fetcher.done";
public static final String ERROR_NAME = "fetcher.error";
- private final static byte VERSION = 2;
+ private final static byte VERSION = 3;
private FetchListEntry fetchListEntry;
private MD5Hash md5Hash;
@@ -31,18 +31,20 @@
private String title = "";
private Outlink[] outlinks;
private long fetchDate;
+ private String contentType = "";
public FetcherOutput() {}
public FetcherOutput(FetchListEntry fetchListEntry,
MD5Hash md5Hash, boolean success, String title,
- Outlink[] outlinks) {
+ Outlink[] outlinks, String contentType) {
this.fetchListEntry = fetchListEntry;
this.md5Hash = md5Hash;
this.success = success;
this.title = title != null ? title : "";
this.outlinks = outlinks;
this.fetchDate = System.currentTimeMillis();
+ this.contentType = (contentType != null) ? contentType : "";
}
public byte getVersion() { return VERSION; }
@@ -65,6 +67,8 @@
}
fetchDate = (version > 1) ? in.readLong() : 0; // added in version=2
+
+ contentType = (version > 2) ? UTF8.readString(in) : ""; //added in version=3
}
public void write(DataOutput out) throws IOException {
@@ -78,6 +82,7 @@
outlinks[i].write(out);
}
out.writeLong(fetchDate);
+ UTF8.writeString(out, contentType);
}
public static FetcherOutput read(DataInput in) throws IOException {
@@ -96,6 +101,7 @@
public Outlink[] getOutlinks() { return outlinks; }
public long getFetchDate() { return fetchDate; }
public void setFetchDate(long fetchDate) { this.fetchDate = fetchDate; }
+ public String getContentType() { return this.contentType; } // "" when the constructor was given null or the record predates version 3
public boolean equals(Object o) {
@@ -107,7 +113,8 @@
this.md5Hash.equals(other.md5Hash) &&
(this.success == other.success) &&
this.title.equals(other.title) &&
- Arrays.equals(this.outlinks, other.outlinks);
+ Arrays.equals(this.outlinks, other.outlinks) &&
+ this.contentType.equals(other.contentType);
}
@@ -122,6 +129,7 @@
buffer.append(" outlink: " + outlinks[i] + "\n");
}
buffer.append("FetchDate: " + new Date(fetchDate) + "\n" );
+ buffer.append("ContentType: " + contentType + "\n" );
return buffer.toString();
}
diff -Nur nutch-20040301-cvs.orig/src/java/net/nutch/fetcher/OutputThread.java
nutch-20040301-cvs/src/java/net/nutch/fetcher/OutputThread.java
--- nutch-20040301-cvs.orig/src/java/net/nutch/fetcher/OutputThread.java
2004-02-13 11:53:48.000000000 -0800
+++ nutch-20040301-cvs/src/java/net/nutch/fetcher/OutputThread.java 2004-03-01
23:32:42.000000000 -0800
@@ -125,7 +125,7 @@
try {
if ( (outputEntry.getResponse() != null)
&& (!outputEntry.getHasFailed()) ) {
- handleFetch(outputEntry.getURL(), outputEntry.getFetchListEntry(),
+ handleFetchAny(outputEntry.getURL(), outputEntry.getFetchListEntry(),
outputEntry.getResponse());
} else {
boolean succeeded= (!outputEntry.getHasFailed()
@@ -159,14 +159,37 @@
prevUrlString);
}
- private void handleFetch(URL url, FetchListEntry fle,
- Response response)
- throws IOException, SAXException, UnhandledContentTypeException,
- DOMErrorException {
+ // 20031228, xing, dispatcher for various Content-Type.
+ //
+ // text/html is passed to handleFetchHtml and text/plain to
+ // handleFetchPlain; any other type raises UnhandledContentTypeException.
+ // A response without a Content-Type header is handled as HTML, which is
+ // what the null-check this method replaces did.
+ //
+ // Exceptions thrown by the handlers propagate to the caller unchanged.
+ private void handleFetchAny(URL url, FetchListEntry fle, Response response)
+ throws IOException, SAXException,
+ UnhandledContentTypeException, DOMErrorException {
+
String contentType = response.getHeader("Content-Type");
- if (contentType != null && !contentType.startsWith("text/html"))
+
+ // The replaced code null-checked the header, so guard before calling
+ // startsWith() to avoid a NullPointerException on header-less responses.
+ if (contentType == null || contentType.startsWith("text/html")) {
+ handleFetchHtml(url, fle, response);
+ } else if (contentType.startsWith("text/plain")) {
+ handleFetchPlain(url, fle, response);
+ } else {
throw new UnhandledContentTypeException(contentType);
-
+ }
+
+ }
+
+ private void handleFetchHtml(URL url, FetchListEntry fle,
+ Response response)
+ throws IOException, SAXException, DOMErrorException {
+ String contentType = response.getHeader("Content-Type");
+
DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
try {
@@ -223,19 +246,29 @@
}
outputPage(new FetcherOutput(fle, MD5Hash.digest(response.getContent()),
- true, title, outlinks),
+ true, title, outlinks, contentType),
new FetcherContent(content),
new FetcherText(text));
}
+ // 20040228, xing: text/plain handler. Emits the raw bytes with an empty
+ // title, no outlinks and empty stripped text (nothing is truncated here).
+ private void handleFetchPlain(URL url, FetchListEntry fle, Response response) {
+ final String mimeType = response.getHeader("Content-Type");
+ final byte[] rawBytes = response.getContent();
+ final FetcherOutput output = new FetcherOutput(
+ fle, MD5Hash.digest(response.getContent()), true, "",
+ new Outlink[0], mimeType);
+ outputPage(output, new FetcherContent(rawBytes), new FetcherText(""));
+ }
+
private void handleNoFetch(FetchListEntry fle, boolean success) {
outputPage(new FetcherOutput(fle,
MD5Hash.digest(fle.getPage().getURL().toString()),
- success, "", new Outlink[0]),
+ success, "", new Outlink[0], ""),
new FetcherContent(new byte[0]),
new FetcherText(""));
}
-
private void outputPage(FetcherOutput fo, FetcherContent raw,
FetcherText stripped) {
try {
@@ -248,7 +281,5 @@
LOG.severe("error writing output:" + t.toString());
}
}
-
-
-
+
}
diff -Nur nutch-20040301-cvs.orig/src/java/net/nutch/indexer/IndexSegment.java
nutch-20040301-cvs/src/java/net/nutch/indexer/IndexSegment.java
--- nutch-20040301-cvs.orig/src/java/net/nutch/indexer/IndexSegment.java
2004-02-01 09:33:48.000000000 -0800
+++ nutch-20040301-cvs/src/java/net/nutch/indexer/IndexSegment.java 2004-03-01
22:44:22.000000000 -0800
@@ -50,25 +50,29 @@
writer.setSimilarity(new NutchSimilarity());
ArrayFile.Reader fetcher = null;
+ ArrayFile.Reader content = null;
ArrayFile.Reader text = null;
int count = 0;
try {
fetcher = new ArrayFile.Reader(new File(directory,
FetcherOutput.DIR_NAME).toString());
+ content = new ArrayFile.Reader(new File(directory,
FetcherContent.DIR_NAME).toString());
text = new ArrayFile.Reader(new
File(directory,FetcherText.DIR_NAME).toString());
String segmentName = directory.getCanonicalFile().getName();
FetcherOutput fetcherOutput = new FetcherOutput();
+ FetcherContent fetcherContent = new FetcherContent();
FetcherText fetcherText = new FetcherText();
while (fetcher.next(fetcherOutput) != null && count++ < maxDocs) {
+ content.next(fetcherContent);
text.next(fetcherText);
if (!fetcherOutput.getSuccess()) // if the fetch failed
continue; // don't index the page
Document doc = makeDocument(segmentName, fetcher.key(),
- fetcherOutput, fetcherText);
+ fetcherOutput, fetcherContent, fetcherText);
writer.addDocument(doc);
}
} catch (EOFException e) {
@@ -77,6 +81,8 @@
} finally {
if (fetcher != null)
fetcher.close();
+ if (content != null)
+ content.close();
if (text != null)
text.close();
}
@@ -87,12 +93,14 @@
private Document makeDocument(String segmentName, long docNo,
FetcherOutput fetcherOutput,
+ FetcherContent fetcherContent,
FetcherText fetcherText)
throws Exception {
FetchListEntry fle = fetcherOutput.getFetchListEntry();
String url = fle.getPage().getURL().toString();
String title = fetcherOutput.getTitle();
+ String contentType = fetcherOutput.getContentType();
if (title.length() > maxTitleLength) { // truncate title if needed
title = title.substring(0, maxTitleLength);
@@ -108,9 +116,17 @@
doc.add(Field.UnIndexed("digest", fetcherOutput.getMD5Hash().toString()));
doc.add(Field.UnIndexed("docNo", Long.toString(docNo, 16)));
doc.add(Field.UnIndexed("segment", segmentName));
+ doc.add(Field.UnIndexed("contentType", contentType));
// content is indexed, so that it's searchable, but not stored in index
- doc.add(Field.UnStored("content", fetcherText.getText()));
+ byte[] raw = fetcherContent.getContent();
+ if (contentType.equals("text/html")) {
+ doc.add(Field.UnStored("content", fetcherText.getText()));
+ } else if (contentType.equals("text/plain")) {
+ doc.add(Field.UnStored("content", new String(raw)));
+ } else {
+ doc.add(Field.UnStored("content", fetcherText.getText()));
+ }
// anchors are indexed, so they're searchable, but not stored in index
String[] anchors = fle.getAnchors();
diff -Nur nutch-20040301-cvs.orig/src/web/jsp/cached.jsp
nutch-20040301-cvs/src/web/jsp/cached.jsp
--- nutch-20040301-cvs.orig/src/web/jsp/cached.jsp 2003-09-05 14:01:47.000000000
-0700
+++ nutch-20040301-cvs/src/web/jsp/cached.jsp 2004-03-02 08:58:57.000000000 -0800
@@ -15,8 +15,17 @@
ResourceBundle.getBundle("org.nutch.jsp.cached", request.getLocale())
.getLocale().getLanguage();
+ String contentType = details.getValue("contentType");
+
+ byte[] raw = bean.getContent(details);
+ String content = null;
+
// FIXME: should check that it's html content, check encoding, etc.
- String content = new String(bean.getContent(details));
+ if (contentType.equals("text/html")) {
+ content = new String(raw);
+ } else if (contentType.equals("text/plain")) {
+ content = "<pre>\n" + (new String(raw)) + "</pre>\n";
+ }
%><base href="<%=details.getValue("url")%>">
<%@ taglib uri="http://jakarta.apache.org/taglibs/i18n" prefix="i18n" %>
<i18n:bundle baseName="org.nutch.jsp.cached"/>
diff -Nur nutch-20040301-cvs.orig/src/web/jsp/search.jsp
nutch-20040301-cvs/src/web/jsp/search.jsp
--- nutch-20040301-cvs.orig/src/web/jsp/search.jsp 2004-02-03 14:06:45.000000000
-0800
+++ nutch-20040301-cvs/src/web/jsp/search.jsp 2004-03-01 16:41:21.000000000 -0800
@@ -77,13 +77,20 @@
HitDetails detail = details[i];
String title = detail.getValue("title");
String url = detail.getValue("url");
+ String contentType = detail.getValue("contentType");
String summary = summaries[i];
String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo();
if (title == null || title.equals("")) // use url for docs w/o title
title = url;
+
+ if (contentType == null || contentType.equals("")) {
+ contentType = "";
+ } else {
+ contentType = "[" + contentType + "]";
+ }
%>
- <br><br><b>
+ <br><br><font color=#0000ff><%=contentType%></font><b>
<a href="<%=url%>"><%=Entities.encode(title)%></a>
</b>
<% if (!"".equals(summary)) { %>
-------------------------------------------------------
SF.Net is sponsored by: Speed Start Your Linux Apps Now.
Build and deploy apps & Web services for Linux with
a free DVD software kit from IBM. Click Now!
http://ads.osdn.com/?ad_id=1356&alloc_id=3438&op=click
_______________________________________________
Nutch-developers mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-developers