Hi,

Attached is a patch for bug 974883 (handling query strings outside the Latin-1/ISO-8859-1 repertoire). For some reason I couldn't attach it in the SourceForge bug tracking system.
It should work, but for some reason it still doesn't. Nonetheless, this patch moves in the right direction
with regard to standards compliance (HTML).


Even with this change, 'search.jsp' is not compliant with the standard because 'header.html',
which is automatically generated from 'header.xml' with XSLT, has


<?xml version="1.0" encoding="UTF-8"?>

at the very beginning.

Because 'header.html' is included after '<body>' appears in search.jsp, the HTML generated from
search.jsp as a whole is malformed and invalid. Therefore, after the XSLT transformation,
we have to get rid of the above line. I know so little about Ant that I can't fix it at the moment, but
it would be nice if someone else could fix this problem.



Jungshik

P.S. For recent versions of Apache Tomcat, 'useBodyEncodingForURI' has to be set to 'true'
in server.xml, like this:


<Connector port="8080" maxThreads="150" minSpareThreads="25"
maxSpareThreads="75" enableLookups="false" redirectPort="8443"
acceptCount="100" debug="0" connectionTimeout="20000"
              disableUploadTimeout="true"
              useBodyEncodingForURI="true">

See http://www.mail-archive.com/[EMAIL PROTECTED]/msg50822.html


Index: src/web/include/style.html
===================================================================
RCS file: /cvsroot/nutch/nutch/src/web/include/style.html,v
retrieving revision 1.4
diff -u -7 -p -r1.4 style.html
--- src/web/include/style.html	3 Feb 2004 22:06:45 -0000	1.4
+++ src/web/include/style.html	16 Jul 2004 15:00:13 -0000
@@ -6,8 +6,9 @@
 .bodytext {  font-family: Arial, Helvetica, sans-serif; font-size: 12px; color: #000000; text-decoration: none}
 .title {  font-family: Arial, Helvetica, sans-serif; font-size: 26px; color: #FF9900; text-decoration: none}
 .intro {  font-family: Arial, Helvetica, sans-serif; font-size: 12px; color: #FF9900; text-decoration: none}
 .orangeTd {background-color: #FF9900}
 ul {list-style-image: url(../img/reiter/ul.gif)}
 h3 {font-family: Arial, Helvetica, sans-serif; font-size: 16px; color: #000000;}
 h4 {font-family: Arial, Helvetica, sans-serif; font-size: 14px; color: #000000;}
+.url {color: #996600;}
 </style>
Index: src/web/jsp/anchors.jsp
===================================================================
RCS file: /cvsroot/nutch/nutch/src/web/jsp/anchors.jsp,v
retrieving revision 1.9
diff -u -7 -p -r1.9 anchors.jsp
--- src/web/jsp/anchors.jsp	5 Sep 2003 21:01:47 -0000	1.9
+++ src/web/jsp/anchors.jsp	16 Jul 2004 15:00:14 -0000
@@ -1,29 +1,36 @@
-<%@ page
+<%@ page 
+  contentType="text/html; charset=UTF-8"
+  pageEncoding="UTF-8"
+
   import="javax.servlet.*"
   import="javax.servlet.http.*"
   import="java.io.*"
   import="java.util.*"
 
   import="net.nutch.html.Entities"
   import="net.nutch.searcher.*"
 %><%
   NutchBean bean = NutchBean.get(application);
+  // set the character encoding to use when interpreting request values 
+  request.setCharacterEncoding("UTF-8");
   bean.LOG.info("anchors request from " + request.getRemoteAddr());
   Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
                     Integer.parseInt(request.getParameter("id")), 0.0f);
   HitDetails details = bean.getDetails(hit);
   String language =
     ResourceBundle.getBundle("org.nutch.jsp.anchors", request.getLocale())
     .getLocale().getLanguage();
   String requestURI = HttpUtils.getRequestURL(request).toString();
   String base = requestURI.substring(0, requestURI.lastIndexOf('/'));
-%><html>
+%>
 <%@ taglib uri="http://jakarta.apache.org/taglibs/i18n" prefix="i18n" %>
 <i18n:bundle baseName="org.nutch.jsp.anchors"/>
+<html lang="<%= language %>">
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
 <head>
 <title>Nutch: <i18n:message key="title"/></title>
 <jsp:include page="/include/style.html"/>
 <base href="<%= base + "/" + language + "/" %>">
 </head>
 
 <body>
Index: src/web/jsp/explain.jsp
===================================================================
RCS file: /cvsroot/nutch/nutch/src/web/jsp/explain.jsp,v
retrieving revision 1.8
diff -u -7 -p -r1.8 explain.jsp
--- src/web/jsp/explain.jsp	5 Sep 2003 21:01:47 -0000	1.8
+++ src/web/jsp/explain.jsp	16 Jul 2004 15:00:14 -0000
@@ -1,28 +1,35 @@
-<%@ page
+<%@ page 
+  contentType="text/html; charset=UTF-8"
+  pageEncoding="UTF-8" 
+
   import="javax.servlet.*"
   import="javax.servlet.http.*"
   import="java.io.*"
   import="java.util.*"
   import="net.nutch.searcher.*"
 %><%
   NutchBean bean = NutchBean.get(application);
+  // set the character encoding to use when interpreting request values 
+  request.setCharacterEncoding("UTF-8");
   bean.LOG.info("explain request from " + request.getRemoteAddr());
   Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
                     Integer.parseInt(request.getParameter("id")), 0.0f);
   HitDetails details = bean.getDetails(hit);
   Query query = Query.parse(request.getParameter("query"));
   String language =
     ResourceBundle.getBundle("org.nutch.jsp.explain", request.getLocale())
     .getLocale().getLanguage();
   String requestURI = HttpUtils.getRequestURL(request).toString();
   String base = requestURI.substring(0, requestURI.lastIndexOf('/'));
-%><html>
+%>
 <%@ taglib uri="http://jakarta.apache.org/taglibs/i18n" prefix="i18n" %>
 <i18n:bundle baseName="org.nutch.jsp.explain"/>
+<html lang="<%= language %>">
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
 <head>
 <title>Nutch: <i18n:message key="title"/></title>
 <jsp:include page="/include/style.html"/>
 <base href="<%= base  + "/" + language %>/">
 </head>
 
 <body>
Index: src/web/jsp/search.jsp
===================================================================
RCS file: /cvsroot/nutch/nutch/src/web/jsp/search.jsp,v
retrieving revision 1.24
diff -u -7 -p -r1.24 search.jsp
--- src/web/jsp/search.jsp	20 May 2004 20:27:16 -0000	1.24
+++ src/web/jsp/search.jsp	16 Jul 2004 15:00:14 -0000
@@ -1,18 +1,24 @@
-<%@ page
+<%@ page 
+  contentType="text/html; charset=UTF-8"
+  pageEncoding="UTF-8"
+
   import="javax.servlet.*"
   import="javax.servlet.http.*"
   import="java.io.*"
   import="java.util.*"
   import="java.net.*"
 
   import="net.nutch.html.Entities"
   import="net.nutch.searcher.*"
 %><%
   NutchBean bean = NutchBean.get(application);
+  // set the character encoding to use when interpreting request values 
+  request.setCharacterEncoding("UTF-8");
+
   bean.LOG.info("query request from " + request.getRemoteAddr());
 
   // get query from request
   String queryString = request.getParameter("query");
   if (queryString == null)			  
     throw new ServletException("no query specified");
   String htmlQueryString = net.nutch.html.Entities.encode(request.getParameter("query"));
@@ -32,33 +38,36 @@
   bean.LOG.info("query: " + queryString);
 
   String language =
     ResourceBundle.getBundle("org.nutch.jsp.search", request.getLocale())
     .getLocale().getLanguage();
   String requestURI = HttpUtils.getRequestURL(request).toString();
   String base = requestURI.substring(0, requestURI.lastIndexOf('/'));
-%><html>
+%>
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
 <%@ taglib uri="http://jakarta.apache.org/taglibs/i18n" prefix="i18n" %>
 <i18n:bundle baseName="org.nutch.jsp.search"/>
+<html lang="<%= language %>">
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
 <head>
 <title>Nutch: <i18n:message key="title"/></title>
 <link rel="icon" href="/img/favicon.ico" type="image/x-icon"/>
 <link rel="shortcut icon" href="/img/favicon.ico" type="image/x-icon"/>
 <jsp:include page="/include/style.html"/>
 <base href="<%= base  + "/" + language %>/">
 </head>
 
 <body>
 
 <jsp:include page="<%= language + "/include/header.html"%>"/>
 
- <form name=search action="/search.jsp" method=get>
- <input name=query size=44 value="<%=htmlQueryString%>">
- <input type=hidden name=hitsPerPage value=<%=hitsPerPage%>>
- <input type=submit value="<i18n:message key="search"/>">
+ <form name="search" action="/search.jsp" method="get">
+ <input name="query" size=44 value="<%=htmlQueryString%>">
+ <input type="hidden" name="hitsPerPage" value="<%=hitsPerPage%>">
+ <input type="submit" value="<i18n:message key="search"/>">
  </form>
 <%
    // perform query
    Hits hits = bean.search(query, start + hitsPerPage);
    int end = (int)Math.min(hits.getTotal(), start + hitsPerPage);
    int length = end-start;
    Hit[] show = hits.getHits(start, length);
@@ -89,30 +98,30 @@
     <br><br><b>
     <a href="<%=url%>"><%=Entities.encode(title)%></a>
     </b>
     <% if (!"".equals(summary)) { %>
     <br><%=summary%>
     <% } %>
     <br>
-    <font color=#996600><%=Entities.encode(url)%></font>
+    <span class="url"><%=Entities.encode(url)%></span>
     (<a href="/cached.jsp?<%=id%>"><i18n:message key="cached"/></a>)
     (<a href="/explain.jsp?<%=id%>&query=<%=URLEncoder.encode(queryString)%>"><i18n:message key="explain"/></a>)
     (<a href="/anchors.jsp?<%=id%>"><i18n:message key="anchors"/></a>)
 <% } %>
 
 <%
   bean.LOG.info("done displaying hits");
 
   if (end < hits.getTotal()) {			  // insert next page button
 %>
-    <form name=search action="/search.jsp" method=get>
-    <input type=hidden name=query value="<%=htmlQueryString%>">
-    <input type=hidden name=start value=<%=end%>>
-    <input type=hidden name=hitsPerPage value=<%=hitsPerPage%>>
-    <input type=submit value=<i18n:message key="next"/>>
+    <form name="search" action="/search.jsp" method="get">
+    <input type="hidden" name="query" value="<%=htmlQueryString%>">
+    <input type="hidden" name="start" value="<%=end%>">
+    <input type="hidden" name="hitsPerPage" value="<%=hitsPerPage%>">
+    <input type="submit" value="<i18n:message key="next"/>">
     </form>
 <%
     }
 %>
 
 <a href="http://www.nutch.org/">
 <img border="0" src="/img/poweredbynutch_01.gif">

Reply via email to