On Sat, 17 Jul 2004, Jungshik Shin wrote:

> Jungshik Shin wrote:
>
> >
> > Attached is a patch for bug 974883 (handling query strings outside
> > the Latin-1/ISO-8859-1 repertoire). For some reason I can't attach it
> > in the SourceForge bug tracking system.
> > It should work, but somehow it still doesn't.
>
> With the directives shown below at the beginning of the JSP files
> (e.g. search.jsp),
>
> <%@ page
>   contentType="text/html; charset=UTF-8"
>  pageEncoding="UTF-8"
>  ..... %>,
>
> everything should be fine. Unfortunately, something very strange is
> going on and it only works
.....
>  From the W3C International mailing list, I learned part of the
> picture: using JSTL (the Nutch web front-end uses the Apache i18n taglib)
> can override the 'contentType' page directive. So can 'custom filters'.


Upgrading to Tomcat 5.x (or any other container implementing the JSP
2.0/Servlet 2.4 specs) would solve the problem. However, not everyone
uses such a container, so I added the workaround described at

  http://java.sun.com/developer/technicalArticles/Intl/MultilingualJSP/
(see the last few lines of the section 'Handling Web Page Encoding')

With this patch, any Unicode character can be used in a query. It'd be
nice to see this checked in, as it brings Nutch a step closer to being a
well-internationalized (I18N) search engine.

Jungshik

P.S. At http://pippin.kaist.ac.kr:8080, you can test it.

? src/web/jsp/search.jsp?query=1298
Index: src/web/include/style.html
===================================================================
RCS file: /cvsroot/nutch/nutch/src/web/include/style.html,v
retrieving revision 1.4
diff -u -7 -p -r1.4 style.html
--- src/web/include/style.html  3 Feb 2004 22:06:45 -0000       1.4
+++ src/web/include/style.html  17 Jul 2004 04:21:13 -0000
@@ -6,8 +6,9 @@
 .bodytext {  font-family: Arial, Helvetica, sans-serif; font-size: 12px; color: #000000; text-decoration: none}
 .title {  font-family: Arial, Helvetica, sans-serif; font-size: 26px; color: #FF9900; text-decoration: none}
 .intro {  font-family: Arial, Helvetica, sans-serif; font-size: 12px; color: #FF9900; text-decoration: none}
 .orangeTd {background-color: #FF9900}
 ul {list-style-image: url(../img/reiter/ul.gif)}
 h3 {font-family: Arial, Helvetica, sans-serif; font-size: 16px; color: #000000;}
 h4 {font-family: Arial, Helvetica, sans-serif; font-size: 14px; color: #000000;}
+.url {color: #996600;}
 </style>
Index: src/web/jsp/anchors.jsp
===================================================================
RCS file: /cvsroot/nutch/nutch/src/web/jsp/anchors.jsp,v
retrieving revision 1.9
diff -u -7 -p -r1.9 anchors.jsp
--- src/web/jsp/anchors.jsp     5 Sep 2003 21:01:47 -0000       1.9
+++ src/web/jsp/anchors.jsp     17 Jul 2004 04:21:13 -0000
@@ -1,29 +1,44 @@
-<%@ page
+<%@ page 
+  contentType="text/html; charset=UTF-8"
+  pageEncoding="UTF-8"
+
   import="javax.servlet.*"
   import="javax.servlet.http.*"
   import="java.io.*"
   import="java.util.*"
 
   import="net.nutch.html.Entities"
   import="net.nutch.searcher.*"
 %><%
   NutchBean bean = NutchBean.get(application);
+  // set the character encoding to use when interpreting request values 
+  request.setCharacterEncoding("UTF-8");
   bean.LOG.info("anchors request from " + request.getRemoteAddr());
   Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
                     Integer.parseInt(request.getParameter("id")), 0.0f);
   HitDetails details = bean.getDetails(hit);
   String language =
     ResourceBundle.getBundle("org.nutch.jsp.anchors", request.getLocale())
     .getLocale().getLanguage();
   String requestURI = HttpUtils.getRequestURL(request).toString();
   String base = requestURI.substring(0, requestURI.lastIndexOf('/'));
-%><html>
+%>
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<%
+  // To prevent the character encoding declared with 'contentType' page
+  // directive from being overridden by JSTL (apache i18n), we freeze it
+  // by flushing the output buffer. 
+  // see http://java.sun.com/developer/technicalArticles/Intl/MultilingualJSP/
+  out.flush();
+%>
 <%@ taglib uri="http://jakarta.apache.org/taglibs/i18n" prefix="i18n" %>
 <i18n:bundle baseName="org.nutch.jsp.anchors"/>
+<html lang="<%= language %>">
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
 <head>
 <title>Nutch: <i18n:message key="title"/></title>
 <jsp:include page="/include/style.html"/>
 <base href="<%= base + "/" + language + "/" %>">
 </head>
 
 <body>
Index: src/web/jsp/explain.jsp
===================================================================
RCS file: /cvsroot/nutch/nutch/src/web/jsp/explain.jsp,v
retrieving revision 1.8
diff -u -7 -p -r1.8 explain.jsp
--- src/web/jsp/explain.jsp     5 Sep 2003 21:01:47 -0000       1.8
+++ src/web/jsp/explain.jsp     17 Jul 2004 04:21:13 -0000
@@ -1,28 +1,43 @@
-<%@ page
+<%@ page 
+  contentType="text/html; charset=UTF-8"
+  pageEncoding="UTF-8" 
+
   import="javax.servlet.*"
   import="javax.servlet.http.*"
   import="java.io.*"
   import="java.util.*"
   import="net.nutch.searcher.*"
 %><%
   NutchBean bean = NutchBean.get(application);
+  // set the character encoding to use when interpreting request values 
+  request.setCharacterEncoding("UTF-8");
   bean.LOG.info("explain request from " + request.getRemoteAddr());
   Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
                     Integer.parseInt(request.getParameter("id")), 0.0f);
   HitDetails details = bean.getDetails(hit);
   Query query = Query.parse(request.getParameter("query"));
   String language =
     ResourceBundle.getBundle("org.nutch.jsp.explain", request.getLocale())
     .getLocale().getLanguage();
   String requestURI = HttpUtils.getRequestURL(request).toString();
   String base = requestURI.substring(0, requestURI.lastIndexOf('/'));
-%><html>
+%>
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<%
+  // To prevent the character encoding declared with 'contentType' page
+  // directive from being overridden by JSTL (apache i18n), we freeze it
+  // by flushing the output buffer. 
+  // see http://java.sun.com/developer/technicalArticles/Intl/MultilingualJSP/
+  out.flush();
+%>
 <%@ taglib uri="http://jakarta.apache.org/taglibs/i18n" prefix="i18n" %>
 <i18n:bundle baseName="org.nutch.jsp.explain"/>
+<html lang="<%= language %>">
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
 <head>
 <title>Nutch: <i18n:message key="title"/></title>
 <jsp:include page="/include/style.html"/>
 <base href="<%= base  + "/" + language %>/">
 </head>
 
 <body>
Index: src/web/jsp/search.jsp
===================================================================
RCS file: /cvsroot/nutch/nutch/src/web/jsp/search.jsp,v
retrieving revision 1.24
diff -u -7 -p -r1.24 search.jsp
--- src/web/jsp/search.jsp      20 May 2004 20:27:16 -0000      1.24
+++ src/web/jsp/search.jsp      17 Jul 2004 04:21:13 -0000
@@ -1,18 +1,24 @@
-<%@ page
+<%@ page 
+  contentType="text/html; charset=UTF-8"
+  pageEncoding="UTF-8"
+
   import="javax.servlet.*"
   import="javax.servlet.http.*"
   import="java.io.*"
   import="java.util.*"
   import="java.net.*"
 
   import="net.nutch.html.Entities"
   import="net.nutch.searcher.*"
 %><%
   NutchBean bean = NutchBean.get(application);
+  // set the character encoding to use when interpreting request values 
+  request.setCharacterEncoding("UTF-8");
+
   bean.LOG.info("query request from " + request.getRemoteAddr());
 
   // get query from request
   String queryString = request.getParameter("query");
   if (queryString == null)                       
     throw new ServletException("no query specified");
   String htmlQueryString = net.nutch.html.Entities.encode(request.getParameter("query"));
@@ -32,33 +38,43 @@
   bean.LOG.info("query: " + queryString);
 
   String language =
     ResourceBundle.getBundle("org.nutch.jsp.search", request.getLocale())
     .getLocale().getLanguage();
   String requestURI = HttpUtils.getRequestURL(request).toString();
   String base = requestURI.substring(0, requestURI.lastIndexOf('/'));
-%><html>
+%>
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<%
+  // To prevent the character encoding declared with 'contentType' page
+  // directive from being overridden by JSTL (apache i18n), we freeze it
+  // by flushing the output buffer. 
+  // see http://java.sun.com/developer/technicalArticles/Intl/MultilingualJSP/
+  out.flush();
+%>
 <%@ taglib uri="http://jakarta.apache.org/taglibs/i18n" prefix="i18n" %>
 <i18n:bundle baseName="org.nutch.jsp.search"/>
+<html lang="<%= language %>">
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
 <head>
 <title>Nutch: <i18n:message key="title"/></title>
 <link rel="icon" href="/img/favicon.ico" type="image/x-icon"/>
 <link rel="shortcut icon" href="/img/favicon.ico" type="image/x-icon"/>
 <jsp:include page="/include/style.html"/>
 <base href="<%= base  + "/" + language %>/">
 </head>
 
 <body>
 
 <jsp:include page="<%= language + "/include/header.html"%>"/>
 
- <form name=search action="/search.jsp" method=get>
- <input name=query size=44 value="<%=htmlQueryString%>">
- <input type=hidden name=hitsPerPage value=<%=hitsPerPage%>>
- <input type=submit value="<i18n:message key="search"/>">
+ <form name="search" action="/search.jsp" method="get">
+ <input name="query" size=44 value="<%=htmlQueryString%>">
+ <input type="hidden" name="hitsPerPage" value="<%=hitsPerPage%>">
+ <input type="submit" value="<i18n:message key="search"/>">
  </form>
 <%
    // perform query
    Hits hits = bean.search(query, start + hitsPerPage);
    int end = (int)Math.min(hits.getTotal(), start + hitsPerPage);
    int length = end-start;
    Hit[] show = hits.getHits(start, length);
@@ -89,30 +105,30 @@
     <br><br><b>
     <a href="<%=url%>"><%=Entities.encode(title)%></a>
     </b>
     <% if (!"".equals(summary)) { %>
     <br><%=summary%>
     <% } %>
     <br>
-    <font color=#996600><%=Entities.encode(url)%></font>
+    <span class="url"><%=Entities.encode(url)%></span>
     (<a href="/cached.jsp?<%=id%>"><i18n:message key="cached"/></a>)
     (<a href="/explain.jsp?<%=id%>&query=<%=URLEncoder.encode(queryString)%>"><i18n:message key="explain"/></a>)
     (<a href="/anchors.jsp?<%=id%>"><i18n:message key="anchors"/></a>)
 <% } %>
 
 <%
   bean.LOG.info("done displaying hits");
 
   if (end < hits.getTotal()) {                   // insert next page button
 %>
-    <form name=search action="/search.jsp" method=get>
-    <input type=hidden name=query value="<%=htmlQueryString%>">
-    <input type=hidden name=start value=<%=end%>>
-    <input type=hidden name=hitsPerPage value=<%=hitsPerPage%>>
-    <input type=submit value=<i18n:message key="next"/>>
+    <form name="search" action="/search.jsp" method="get">
+    <input type="hidden" name="query" value="<%=htmlQueryString%>">
+    <input type="hidden" name="start" value="<%=end%>">
+    <input type="hidden" name="hitsPerPage" value="<%=hitsPerPage%>">
+    <input type="submit" value="<i18n:message key="next"/>">
     </form>
 <%
     }
 %>
 
 <a href="http://www.nutch.org/">
 <img border="0" src="/img/poweredbynutch_01.gif">

Reply via email to