On Sat, 17 Jul 2004, Jungshik Shin wrote:
> Jungshik Shin wrote:
>
> >
> > Attached is a patch for bug 974883 (handling query strings outside
> > the Latin-1/ISO-8859-1 repertoire). For some reason I can't attach it to
> > the SourceForge bug tracking system.
> > It should work, but for some reason it still doesn't.
>
> With directives at the beginning of JSP files (e.g. search.jsp) shown
> below,
>
> <%@ page
> contentType="text/html; charset=UTF-8"
> pageEncoding="UTF-8"
> ..... %>,
>
> everything should be fine. Unfortunately, something very strange is
> going on and it only works
.....
> From the W3C International mailing list, I learned part of the
> picture: using JSTL (the Nutch web front-end uses the Apache i18n taglib)
> can override the 'contentType' page directive. So can 'custom filters'.
Upgrading to Tomcat 5.x (or any other container implementing the JSP
2.0/Servlet 2.4 specs) would solve the problem. However, not everyone
uses such a container, so I added the work-around described at
http://java.sun.com/developer/technicalArticles/Intl/MultilingualJSP/
(see the last few lines of the section 'Handling Web Page Encoding').
With this patch, any Unicode character can be used in a query. It'd be
nice to see this checked in, as it brings Nutch a step closer to being a
well-internationalized (I18N) search engine.
Jungshik
P.S. At http://pippin.kaist.ac.kr:8080, you can test it.
? src/web/jsp/search.jsp?query=1298
Index: src/web/include/style.html
===================================================================
RCS file: /cvsroot/nutch/nutch/src/web/include/style.html,v
retrieving revision 1.4
diff -u -7 -p -r1.4 style.html
--- src/web/include/style.html 3 Feb 2004 22:06:45 -0000 1.4
+++ src/web/include/style.html 17 Jul 2004 04:21:13 -0000
@@ -6,8 +6,9 @@
.bodytext { font-family: Arial, Helvetica, sans-serif; font-size: 12px; color:
#000000; text-decoration: none}
.title { font-family: Arial, Helvetica, sans-serif; font-size: 26px; color: #FF9900;
text-decoration: none}
.intro { font-family: Arial, Helvetica, sans-serif; font-size: 12px; color: #FF9900;
text-decoration: none}
.orangeTd {background-color: #FF9900}
ul {list-style-image: url(../img/reiter/ul.gif)}
h3 {font-family: Arial, Helvetica, sans-serif; font-size: 16px; color: #000000;}
h4 {font-family: Arial, Helvetica, sans-serif; font-size: 14px; color: #000000;}
+.url {color: #996600;}
</style>
Index: src/web/jsp/anchors.jsp
===================================================================
RCS file: /cvsroot/nutch/nutch/src/web/jsp/anchors.jsp,v
retrieving revision 1.9
diff -u -7 -p -r1.9 anchors.jsp
--- src/web/jsp/anchors.jsp 5 Sep 2003 21:01:47 -0000 1.9
+++ src/web/jsp/anchors.jsp 17 Jul 2004 04:21:13 -0000
@@ -1,29 +1,44 @@
-<%@ page
+<%@ page
+ contentType="text/html; charset=UTF-8"
+ pageEncoding="UTF-8"
+
import="javax.servlet.*"
import="javax.servlet.http.*"
import="java.io.*"
import="java.util.*"
import="net.nutch.html.Entities"
import="net.nutch.searcher.*"
%><%
NutchBean bean = NutchBean.get(application);
+ // set the character encoding to use when interpreting request values
+ request.setCharacterEncoding("UTF-8");
bean.LOG.info("anchors request from " + request.getRemoteAddr());
Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
Integer.parseInt(request.getParameter("id")), 0.0f);
HitDetails details = bean.getDetails(hit);
String language =
ResourceBundle.getBundle("org.nutch.jsp.anchors", request.getLocale())
.getLocale().getLanguage();
String requestURI = HttpUtils.getRequestURL(request).toString();
String base = requestURI.substring(0, requestURI.lastIndexOf('/'));
-%><html>
+%>
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<%
+ // To prevent the character encoding declared with 'contentType' page
+ // directive from being overridden by JSTL (apache i18n), we freeze it
+ // by flushing the output buffer.
+ // see http://java.sun.com/developer/technicalArticles/Intl/MultilingualJSP/
+ out.flush();
+%>
<%@ taglib uri="http://jakarta.apache.org/taglibs/i18n" prefix="i18n" %>
<i18n:bundle baseName="org.nutch.jsp.anchors"/>
+<html lang="<%= language %>">
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<head>
<title>Nutch: <i18n:message key="title"/></title>
<jsp:include page="/include/style.html"/>
<base href="<%= base + "/" + language + "/" %>">
</head>
<body>
Index: src/web/jsp/explain.jsp
===================================================================
RCS file: /cvsroot/nutch/nutch/src/web/jsp/explain.jsp,v
retrieving revision 1.8
diff -u -7 -p -r1.8 explain.jsp
--- src/web/jsp/explain.jsp 5 Sep 2003 21:01:47 -0000 1.8
+++ src/web/jsp/explain.jsp 17 Jul 2004 04:21:13 -0000
@@ -1,28 +1,43 @@
-<%@ page
+<%@ page
+ contentType="text/html; charset=UTF-8"
+ pageEncoding="UTF-8"
+
import="javax.servlet.*"
import="javax.servlet.http.*"
import="java.io.*"
import="java.util.*"
import="net.nutch.searcher.*"
%><%
NutchBean bean = NutchBean.get(application);
+ // set the character encoding to use when interpreting request values
+ request.setCharacterEncoding("UTF-8");
bean.LOG.info("explain request from " + request.getRemoteAddr());
Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
Integer.parseInt(request.getParameter("id")), 0.0f);
HitDetails details = bean.getDetails(hit);
Query query = Query.parse(request.getParameter("query"));
String language =
ResourceBundle.getBundle("org.nutch.jsp.explain", request.getLocale())
.getLocale().getLanguage();
String requestURI = HttpUtils.getRequestURL(request).toString();
String base = requestURI.substring(0, requestURI.lastIndexOf('/'));
-%><html>
+%>
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<%
+ // To prevent the character encoding declared with 'contentType' page
+ // directive from being overridden by JSTL (apache i18n), we freeze it
+ // by flushing the output buffer.
+ // see http://java.sun.com/developer/technicalArticles/Intl/MultilingualJSP/
+ out.flush();
+%>
<%@ taglib uri="http://jakarta.apache.org/taglibs/i18n" prefix="i18n" %>
<i18n:bundle baseName="org.nutch.jsp.explain"/>
+<html lang="<%= language %>">
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<head>
<title>Nutch: <i18n:message key="title"/></title>
<jsp:include page="/include/style.html"/>
<base href="<%= base + "/" + language %>/">
</head>
<body>
Index: src/web/jsp/search.jsp
===================================================================
RCS file: /cvsroot/nutch/nutch/src/web/jsp/search.jsp,v
retrieving revision 1.24
diff -u -7 -p -r1.24 search.jsp
--- src/web/jsp/search.jsp 20 May 2004 20:27:16 -0000 1.24
+++ src/web/jsp/search.jsp 17 Jul 2004 04:21:13 -0000
@@ -1,18 +1,24 @@
-<%@ page
+<%@ page
+ contentType="text/html; charset=UTF-8"
+ pageEncoding="UTF-8"
+
import="javax.servlet.*"
import="javax.servlet.http.*"
import="java.io.*"
import="java.util.*"
import="java.net.*"
import="net.nutch.html.Entities"
import="net.nutch.searcher.*"
%><%
NutchBean bean = NutchBean.get(application);
+ // set the character encoding to use when interpreting request values
+ request.setCharacterEncoding("UTF-8");
+
bean.LOG.info("query request from " + request.getRemoteAddr());
// get query from request
String queryString = request.getParameter("query");
if (queryString == null)
throw new ServletException("no query specified");
String htmlQueryString =
net.nutch.html.Entities.encode(request.getParameter("query"));
@@ -32,33 +38,43 @@
bean.LOG.info("query: " + queryString);
String language =
ResourceBundle.getBundle("org.nutch.jsp.search", request.getLocale())
.getLocale().getLanguage();
String requestURI = HttpUtils.getRequestURL(request).toString();
String base = requestURI.substring(0, requestURI.lastIndexOf('/'));
-%><html>
+%>
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<%
+ // To prevent the character encoding declared with 'contentType' page
+ // directive from being overridden by JSTL (apache i18n), we freeze it
+ // by flushing the output buffer.
+ // see http://java.sun.com/developer/technicalArticles/Intl/MultilingualJSP/
+ out.flush();
+%>
<%@ taglib uri="http://jakarta.apache.org/taglibs/i18n" prefix="i18n" %>
<i18n:bundle baseName="org.nutch.jsp.search"/>
+<html lang="<%= language %>">
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<head>
<title>Nutch: <i18n:message key="title"/></title>
<link rel="icon" href="/img/favicon.ico" type="image/x-icon"/>
<link rel="shortcut icon" href="/img/favicon.ico" type="image/x-icon"/>
<jsp:include page="/include/style.html"/>
<base href="<%= base + "/" + language %>/">
</head>
<body>
<jsp:include page="<%= language + "/include/header.html"%>"/>
- <form name=search action="/search.jsp" method=get>
- <input name=query size=44 value="<%=htmlQueryString%>">
- <input type=hidden name=hitsPerPage value=<%=hitsPerPage%>>
- <input type=submit value="<i18n:message key="search"/>">
+ <form name="search" action="/search.jsp" method="get">
+ <input name="query" size=44 value="<%=htmlQueryString%>">
+ <input type="hidden" name="hitsPerPage" value="<%=hitsPerPage%>">
+ <input type="submit" value="<i18n:message key="search"/>">
</form>
<%
// perform query
Hits hits = bean.search(query, start + hitsPerPage);
int end = (int)Math.min(hits.getTotal(), start + hitsPerPage);
int length = end-start;
Hit[] show = hits.getHits(start, length);
@@ -89,30 +105,30 @@
<br><br><b>
<a href="<%=url%>"><%=Entities.encode(title)%></a>
</b>
<% if (!"".equals(summary)) { %>
<br><%=summary%>
<% } %>
<br>
- <font color=#996600><%=Entities.encode(url)%></font>
+ <span class="url"><%=Entities.encode(url)%></span>
(<a href="/cached.jsp?<%=id%>"><i18n:message key="cached"/></a>)
(<a
href="/explain.jsp?<%=id%>&query=<%=URLEncoder.encode(queryString)%>"><i18n:message
key="explain"/></a>)
(<a href="/anchors.jsp?<%=id%>"><i18n:message key="anchors"/></a>)
<% } %>
<%
bean.LOG.info("done displaying hits");
if (end < hits.getTotal()) { // insert next page button
%>
- <form name=search action="/search.jsp" method=get>
- <input type=hidden name=query value="<%=htmlQueryString%>">
- <input type=hidden name=start value=<%=end%>>
- <input type=hidden name=hitsPerPage value=<%=hitsPerPage%>>
- <input type=submit value=<i18n:message key="next"/>>
+ <form name="search" action="/search.jsp" method="get">
+ <input type="hidden" name="query" value="<%=htmlQueryString%>">
+ <input type="hidden" name="start" value="<%=end%>">
+ <input type="hidden" name="hitsPerPage" value="<%=hitsPerPage%>">
+ <input type="submit" value="<i18n:message key="next"/>">
</form>
<%
}
%>
<a href="http://www.nutch.org/">
<img border="0" src="/img/poweredbynutch_01.gif">