Re: [PATCH] Charset-NG

Paul P Komkoff Jr Thu, 02 Jun 2005 07:35:14 -0700

Replying to Alexander Malysh:
> Hi Paul,
> 
> only one objection from my side: please fix indentation. Please use 4 
> spaces instead of 2 as indent.
> 
> If there are no objections from others and you fix the indentation, I will
> happily commit this patch.


done

(also take a look at
http://anna.sgu.ru/viewmtn/headofbranch.psp?branch=net.stingr.kannel.generic
)

-- 
Paul P 'Stingray' Komkoff Jr // http://stingr.net/key <- my pgp key
 This message represents the official view of the voices in my head

# 
# patch "gw/wap-appl.c"
#  from [6698038f9ba8d8227a90f72928dcbe1cbf2c2668]
#    to [20be34e1d890e0b7c974590165eb420ea308bf2b]
# 
# patch "gw/wml_compiler.c"
#  from [727a5ea75763c1cd18d8d8aaa1de3f2f4d55bc45]
#    to [211c041170a009bf497c2ae626c0b0a7a107bde2]
# 
--- gw/wap-appl.c
+++ gw/wap-appl.c
@@ -510,15 +510,8 @@
  * to handle those charsets for all content types, just WML/XHTML. */
 static void add_charset_headers(List *headers) 
 {
-    long i, len;
-    
-    gw_assert(charsets != NULL);
-    len = gwlist_len(charsets);
-    for (i = 0; i < len; i++) {
-        unsigned char *charset = octstr_get_cstr(gwlist_get(charsets, i));
-        if (!http_charset_accepted(headers, charset))
-            http_header_add(headers, "Accept-Charset", charset);
-    }
+    if (!http_charset_accepted(headers, "utf-8"))
+        http_header_add(headers, "Accept-Charset", "utf-8");
 }
 
 
@@ -707,6 +700,36 @@
 }
 
 
+static void normalize_charset(struct content * content, List* device_headers) {
+    Octstr* charset;
+
+    if ((charset = find_charset_encoding(content->body)) == NULL)
+        if (octstr_len(content->charset) > 0) {
+            charset = octstr_duplicate(content->charset);
+        } else {
+            charset = octstr_imm("UTF-8");
+        }
+
+    debug("wap-appl",0,"Normalizing charset from %s", octstr_get_cstr(charset));
+
+    if (octstr_case_compare(charset, octstr_imm("UTF-8")) != 0 &&
+      !http_charset_accepted(device_headers, octstr_get_cstr(charset))) {
+        if (!http_charset_accepted(device_headers, "UTF-8")) {
+            warning(0, "WSP: Device doesn't support charset <%s> neither UTF-8",
+              octstr_get_cstr(charset));
+        } else {
+            debug("wsp",0,"Converting wml/xhtml from charset <%s> to UTF-8",
+              octstr_get_cstr(charset));
+            if (charset_convert(content->body,
+              octstr_get_cstr(charset), "UTF-8") >= 0) {
+                octstr_destroy(content->charset);
+                content->charset = octstr_create("UTF-8");
+            }
+        }
+    }
+    octstr_destroy(charset);
+}
+
 /*
  * Return an HTTP reply back to the phone.
  */
@@ -848,54 +871,8 @@
         if (octstr_search(content.type, octstr_imm("text/vnd.wap.wml"), 0) >= 0 || 
             octstr_search(content.type, octstr_imm("application/xhtml+xml"), 0) >= 0 ||
             octstr_search(content.type, octstr_imm("application/vnd.wap.xhtml+xml"), 0) >= 0) {
-            Octstr *charset;
-            
-            /* get charset used in content body, default to utf-8 if not present */
-            if ((charset = find_charset_encoding(content.body)) == NULL)
-                charset = octstr_imm("UTF-8"); 
 
-            /* convert to utf-8 if original charset is not utf-8 
-             * and device supports it */
-
-            if (octstr_case_compare(charset, octstr_imm("UTF-8")) < 0 &&
-                !http_charset_accepted(device_headers, octstr_get_cstr(charset))) {
-                if (!http_charset_accepted(device_headers, "UTF-8")) {
-                    warning(0, "WSP: Device doesn't support charset <%s> neither UTF-8", 
-                                octstr_get_cstr(charset));
-                } else {
-                    /* convert to utf-8 */
-                    debug("wsp",0,"Converting wml/xhtml from charset <%s> to UTF-8", 
-                          octstr_get_cstr(charset));
-                    if (charset_convert(content.body, 
-                                        octstr_get_cstr(charset), "UTF-8") >= 0) {
-                        octstr_destroy(content.charset);
-                        content.charset = octstr_create("UTF-8");
-                        /* XXX it might be good idea to change <?xml...encoding?> */
-                    }
-                 }
-            }
- 
-            /* convert to iso-8859-1 if original charset is not iso 
-             * and device supports it */
-            else if (octstr_case_compare(charset, octstr_imm("ISO-8859-1")) < 0 &&
-                    !http_charset_accepted(device_headers, octstr_get_cstr(charset))) {
-                if (!http_charset_accepted(device_headers, "ISO-8859-1")) {
-                    warning(0, "WSP: Device doesn't support charset <%s> neither ISO-8859-1", 
-                            octstr_get_cstr(charset));
-                } else {
-                    /* convert to iso-latin1 */
-                    debug("wsp",0,"Converting wml/xhtml from charset <%s> to ISO-8859-1", 
-                          octstr_get_cstr(charset));
-                    if (charset_convert(content.body, 
-                                        octstr_get_cstr(charset), "ISO-8859-1") >= 0) {
-                        octstr_destroy(content.charset);
-                        content.charset = octstr_create("ISO-8859-1");
-                        /* XXX it might be good idea to change <?xml...encoding?> */
-                    }
-                }
-            }
-
-            octstr_destroy(charset);
+            normalize_charset(&content, device_headers);
         }
 
         /* set WBXML Encoding-Version for wml->wmlc conversion */
--- gw/wml_compiler.c
+++ gw/wml_compiler.c
@@ -335,7 +335,6 @@
     xmlDocPtr pDoc = NULL;
     char *wml_c_text;
     wml_binary_t *wbxml = NULL;
-    Octstr *encoding = NULL;
 
     *wml_binary = octstr_create("");
     wbxml = wml_binary_create();
@@ -347,63 +346,44 @@
        -- tuo */
     parse_entities(wml_text);
 
-    /* transcode from charset to UTF-8 */
-    if (charset && octstr_len(charset) && 
-        octstr_case_compare(charset, octstr_imm("UTF-8")) == -1) {
-        debug("wml_compile", 0, "WML compiler: Transcoding from <%s> to UTF-8", 
-              octstr_get_cstr(charset));
-        set_charset(wml_text, charset);
-    }
-
-    /* 
-     * If we did not set the character set encoding yet, then obviously
-     * there was no charset argument in the Content-Type HTTP reply header.
-     * We have to scan the xml preamble line for an explicite encoding
-     * definition to allow transcoding from UTF-8 to that charset after 
-     * libxml2 did all it's parsing magic. (Keep in mind libxml2 uses UTF-8
-     * as internal encoding.) -- Stipe
-     */
-
-    /* 
-     * We will trust the xml preamble encoding more then the HTTP header 
-     * charset definition.
-     */
-    if ((encoding = find_charset_encoding(wml_text)) != NULL) {
-        /* ok, we rely on the xml preamble encoding */
-    } else if (charset && octstr_len(charset) > 0) {
-        /* we had a HTTP response charset, use this */
-        encoding = octstr_duplicate(charset);
-    } else {
-        /* we had none, so use UTF-8 as default */
-        encoding = octstr_create("UTF-8");
-    }
-
     size = octstr_len(wml_text);
     wml_c_text = octstr_get_cstr(wml_text);
+    debug("ww",0, "given encoding: %s", octstr_get_cstr(charset));
 
     if (octstr_search_char(wml_text, '\0', 0) != -1) {    
         error(0, "WML compiler: Compiling error: "
                  "\\0 character found in the middle of the WML source.");
         ret = -1;
     } else {
-
+#if 0
+        char *tag1, *tag2;
+        tag1 = strchr(wml_c_text, '<');
+        if ((tag1 != NULL) && (tag1 == strstr(wml_c_text, "<?"))) {
+            tag2 = strchr(tag1, '>');
+            if ((tag2 != NULL) && (tag2 - 1 == strstr(tag1, "?>"))) {
+                debug("wml_compile",0,"Stripping preamble");
+                size -= ++tag2 - wml_c_text;
+                wml_c_text = tag2;
+            }
+        }
+#endif
         /* 
          * An empty octet string for the binary output is created, the wml 
          * source is parsed into a parsing tree and the tree is then compiled 
          * into binary.
          */
 
-        pDoc = xmlParseMemory(wml_c_text, size);
+        pDoc = xmlReadMemory(wml_c_text, size, NULL, octstr_get_cstr(charset), XML_PARSE_RECOVER | XML_PARSE_NONET);
        
         if (pDoc != NULL) {
             /* 
              * If we have a set internal encoding, then apply this information 
              * to the XML parsing tree document for later transcoding ability.
              */
-            if (encoding)
-                pDoc->charset = xmlParseCharEncoding(octstr_get_cstr(encoding));
+            if (charset)
+                pDoc->charset = xmlParseCharEncoding(octstr_get_cstr(charset));
 
-            ret = parse_document(pDoc, encoding, &wbxml, version);
+            ret = parse_document(pDoc, charset, &wbxml, version);
             wml_binary_output(*wml_binary, wbxml);
         } else {    
             error(0, "WML compiler: Compiling error: "
@@ -413,7 +393,6 @@
     }
 
     wml_binary_destroy(wbxml);
-    octstr_destroy(encoding);
 
     if (pDoc) 
         xmlFreeDoc(pDoc);

Re: [PATCH] Charset-NG

Reply via email to