Re: HttpClient UTF-8 problem !!!

Ortwin Glück Thu, 18 Sep 2003 01:15:06 -0700

Eric Chow wrote:

============================================================================
import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.*;
import org.apache.commons.httpclient.cookie.*;
import org.apache.commons.httpclient.util.*;
import java.io.*;
import java.util.*;


public class TradSimUnicode {
   private static String checkURL = "cgibin.erols.com";

public static void main(String argv[]) {

HttpClient client = new HttpClient();

try {

         HostConfiguration hc = new HostConfiguration();
         hc.setHost(checkURL, 80, "http");
         client.setHostConfiguration(hc);
         client.getState().setCookiePolicy(CookiePolicy.COMPATIBILITY);

check(client, '\u4e0e');

      } catch(Exception e) {
         e.printStackTrace();
      }
   }

public static void check(HttpClient client, char c) throws Exception {

      String code = toFormat(c);
      String s = unicodeToString(code);


      PostMethod post = new
PostMethod("/mandarintools/cgi-bin/charlook.pl");

      // Prepare login parameters
      NameValuePair v1     = new NameValuePair("searchmode", "standard");
      NameValuePair v2     = new NameValuePair("printtype", "utf8");
      NameValuePair v3     = new NameValuePair("chartype", "trad");
      NameValuePair v4     = new NameValuePair("ordering", "frequency");
      NameValuePair v5     = new NameValuePair("display", "char");
      NameValuePair v6     = new NameValuePair("display", "variants");
      NameValuePair v7     = new NameValuePair("display", "unicode");
      NameValuePair v8     = new NameValuePair("enctype", "utf8");
      NameValuePair v9     = new NameValuePair("whatchar", s);
      NameValuePair v10    = new NameValuePair("searchchar", "Search by
Character");


      NameValuePair[] valPairs = { v1, v2, v3, v4, v5, v6, v7, v8, v9,
v10 };

post.setRequestBody(valPairs);

client.executeMethod(post);

String resp = post.getResponseBodyAsString();

System.out.write(resp.getBytes("UTF-8"));

      post.releaseConnection();
   }

   public static String unicodeToString(String unicodeString) {
  if (unicodeString == null) {
   return null;
  }

StringBuffer buf = new StringBuffer();

StringTokenizer tokens = new StringTokenizer(unicodeString, "\\u");

  while(tokens.hasMoreTokens()) {
   String token = (String)tokens.nextToken();

char oneUnicodeChar = (char)Integer.parseInt(token, 16);

   buf.append(oneUnicodeChar);
  }

  return buf.toString();
 }

 public static String toFormat(int n){
  String zeros = "000";
  String body = Integer.toHexString(n);
  return "\\u" + zeros.substring(0, 4-body.length()) + body;
 }
}

============================================================

Eric,

I think you have not fully understood how Java handles Unicode. Basically, you can get rid of your two methods unicodeToString and toFormat entirely and just pass in the character as you are doing now, without touching it:

NameValuePair v9 = new NameValuePair("whatchar", String.valueOf(c));

By default, HttpClient uses ISO-8859-1 as the encoding for parameters passed into the POST method (which, by the way, is not consistent with the GET method, where UTF-8 is used by default). If you wish to use UTF-8, you must set the Content-Type header manually:

post.addRequestHeader("Content-Type", FORM_URL_ENCODED_CONTENT_TYPE+"; charset=UTF-8");

hope that helps

Odi


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Re: HttpClient UTF-8 problem !!!

Reply via email to