Replying to Jonathan Houser:
Well, I did something. It has survived only basic testing and is, of
course, not complete yet, but I'll post it here - maybe someone will
comment on it.
P.S. Add octstr_destroy(charset) at the end of normalize_charset() manually :)
--
Paul P 'Stingray' Komkoff Jr // http://stingr.net/key <- my pgp key
This message represents the official view of the voices in my head
--- gateway.orig/gw/wml_compiler.c 2005-02-11 23:58:53.000000000 +0300
+++ gateway.C9/gw/wml_compiler.c 2005-05-01 18:51:12.657859082 +0400
@@ -335,7 +335,6 @@
xmlDocPtr pDoc = NULL;
char *wml_c_text;
wml_binary_t *wbxml = NULL;
- Octstr *encoding = NULL;
*wml_binary = octstr_create("");
wbxml = wml_binary_create();
@@ -347,37 +346,6 @@
-- tuo */
parse_entities(wml_text);
- /* transcode from charset to UTF-8 */
- if (charset && octstr_len(charset) &&
- octstr_case_compare(charset, octstr_imm("UTF-8")) == -1) {
- debug("wml_compile", 0, "WML compiler: Transcoding from <%s> to
UTF-8",
- octstr_get_cstr(charset));
- set_charset(wml_text, charset);
- }
-
- /*
- * If we did not set the character set encoding yet, then obviously
- * there was no charset argument in the Content-Type HTTP reply header.
- * We have to scan the xml preamble line for an explicite encoding
- * definition to allow transcoding from UTF-8 to that charset after
- * libxml2 did all it's parsing magic. (Keep in mind libxml2 uses UTF-8
- * as internal encoding.) -- Stipe
- */
-
- /*
- * We will trust the xml preamble encoding more then the HTTP header
- * charset definition.
- */
- if ((encoding = find_charset_encoding(wml_text)) != NULL) {
- /* ok, we rely on the xml preamble encoding */
- } else if (charset && octstr_len(charset) > 0) {
- /* we had a HTTP response charset, use this */
- encoding = octstr_duplicate(charset);
- } else {
- /* we had none, so use UTF-8 as default */
- encoding = octstr_create("UTF-8");
- }
-
size = octstr_len(wml_text);
wml_c_text = octstr_get_cstr(wml_text);
@@ -393,17 +361,17 @@
* into binary.
*/
- pDoc = xmlParseMemory(wml_c_text, size);
+ pDoc = xmlReadMemory(wml_c_text, size, NULL, octstr_get_cstr(charset),
XML_PARSE_RECOVER | XML_PARSE_NONET);
if (pDoc != NULL) {
/*
* If we have a set internal encoding, then apply this information
* to the XML parsing tree document for later transcoding ability.
*/
- if (encoding)
- pDoc->charset =
xmlParseCharEncoding(octstr_get_cstr(encoding));
+ if (charset)
+ pDoc->charset = xmlParseCharEncoding(octstr_get_cstr(charset));
- ret = parse_document(pDoc, encoding, &wbxml, version);
+ ret = parse_document(pDoc, charset, &wbxml, version);
wml_binary_output(*wml_binary, wbxml);
} else {
error(0, "WML compiler: Compiling error: "
@@ -413,7 +381,6 @@
}
wml_binary_destroy(wbxml);
- octstr_destroy(encoding);
if (pDoc)
xmlFreeDoc(pDoc);
Index: gateway.C9/gw/wap-appl.c
===================================================================
--- gateway.C9.orig/gw/wap-appl.c 2005-04-29 09:09:32.000000000 +0400
+++ gateway.C9/gw/wap-appl.c 2005-05-01 18:59:36.177853180 +0400
@@ -523,15 +523,8 @@
* to handle those charsets for all content types, just WML/XHTML. */
static void add_charset_headers(List *headers)
{
- long i, len;
-
- gw_assert(charsets != NULL);
- len = gwlist_len(charsets);
- for (i = 0; i < len; i++) {
- unsigned char *charset = octstr_get_cstr(gwlist_get(charsets, i));
- if (!http_charset_accepted(headers, charset))
- http_header_add(headers, "Accept-Charset", charset);
- }
+ if (!http_charset_accepted(headers, "utf-8"))
+ http_header_add(headers, "Accept-Charset", "utf-8"); /* only "utf-8" is offered, and only when the headers do not already accept it */
}
@@ -720,6 +713,33 @@
}
+/*
+ * Make sure the WML/XHTML body held in `content' is in a charset the
+ * device accepts, converting it to UTF-8 when necessary.
+ *
+ * The body's charset is whatever find_charset_encoding() reports for
+ * content->body; if it reports nothing, content->charset is used when it
+ * is non-empty, and "UTF-8" otherwise.
+ *
+ * If that charset is not UTF-8 and `device_headers' do not accept it:
+ *   - when the device does not accept UTF-8 either, a warning is logged
+ *     and the content is left untouched;
+ *   - otherwise the body is converted to UTF-8 and, if the conversion
+ *     succeeds, content->charset is replaced with "UTF-8".
+ */
+static void normalize_charset(struct content *content, List *device_headers)
+{
+    Octstr *charset;
+
+    /* Braces are explicit so the fallback chain cannot be misread as a
+     * dangling else. */
+    charset = find_charset_encoding(content->body);
+    if (charset == NULL) {
+        if (octstr_len(content->charset) > 0)
+            charset = octstr_duplicate(content->charset);
+        else
+            charset = octstr_imm("UTF-8");
+    }
+
+    if (octstr_case_compare(charset, octstr_imm("UTF-8")) != 0 &&
+        !http_charset_accepted(device_headers, octstr_get_cstr(charset))) {
+        if (!http_charset_accepted(device_headers, "UTF-8")) {
+            warning(0, "WSP: Device doesn't support charset <%s> neither UTF-8",
+                    octstr_get_cstr(charset));
+        } else {
+            debug("wsp", 0, "Converting wml/xhtml from charset <%s> to UTF-8",
+                  octstr_get_cstr(charset));
+            if (charset_convert(content->body,
+                                octstr_get_cstr(charset), "UTF-8") >= 0) {
+                octstr_destroy(content->charset);
+                content->charset = octstr_create("UTF-8");
+            }
+        }
+    }
+
+    /*
+     * Release the charset name obtained above so it does not leak on
+     * every call.
+     * NOTE(review): this is also reached with the octstr_imm() value;
+     * presumably octstr_destroy() ignores immutable strings - confirm.
+     */
+    octstr_destroy(charset);
+}
+
/*
* Return an HTTP reply back to the phone.
*/
@@ -861,54 +881,8 @@
if (octstr_search(content.type, octstr_imm("text/vnd.wap.wml"), 0) >=
0 ||
octstr_search(content.type, octstr_imm("application/xhtml+xml"),
0) >= 0 ||
octstr_search(content.type,
octstr_imm("application/vnd.wap.xhtml+xml"), 0) >= 0) {
- Octstr *charset;
-
- /* get charset used in content body, default to utf-8 if not
present */
- if ((charset = find_charset_encoding(content.body)) == NULL)
- charset = octstr_imm("UTF-8");
-
- /* convert to utf-8 if original charset is not utf-8
- * and device supports it */
-
- if (octstr_case_compare(charset, octstr_imm("UTF-8")) < 0 &&
- !http_charset_accepted(device_headers,
octstr_get_cstr(charset))) {
- if (!http_charset_accepted(device_headers, "UTF-8")) {
- warning(0, "WSP: Device doesn't support charset <%s>
neither UTF-8",
- octstr_get_cstr(charset));
- } else {
- /* convert to utf-8 */
- debug("wsp",0,"Converting wml/xhtml from charset <%s> to
UTF-8",
- octstr_get_cstr(charset));
- if (charset_convert(content.body,
- octstr_get_cstr(charset), "UTF-8") >=
0) {
- octstr_destroy(content.charset);
- content.charset = octstr_create("UTF-8");
- /* XXX it might be good idea to change
<?xml...encoding?> */
- }
- }
- }
-
- /* convert to iso-8859-1 if original charset is not iso
- * and device supports it */
- else if (octstr_case_compare(charset, octstr_imm("ISO-8859-1")) <
0 &&
- !http_charset_accepted(device_headers,
octstr_get_cstr(charset))) {
- if (!http_charset_accepted(device_headers, "ISO-8859-1")) {
- warning(0, "WSP: Device doesn't support charset <%s>
neither ISO-8859-1",
- octstr_get_cstr(charset));
- } else {
- /* convert to iso-latin1 */
- debug("wsp",0,"Converting wml/xhtml from charset <%s> to
ISO-8859-1",
- octstr_get_cstr(charset));
- if (charset_convert(content.body,
- octstr_get_cstr(charset),
"ISO-8859-1") >= 0) {
- octstr_destroy(content.charset);
- content.charset = octstr_create("ISO-8859-1");
- /* XXX it might be good idea to change
<?xml...encoding?> */
- }
- }
- }
- octstr_destroy(charset);
+ normalize_charset(&content, device_headers);
}
/* set WBXML Encoding-Version for wml->wmlc conversion */