Problem in Parsing xml with Korean Characters

Sereena Tue, 18 Apr 2006 07:00:52 -0700

I am trying to parse an XML file that contains Korean characters, but when 
certain Korean characters are encountered, the parsing stops. If I remove the 
characters causing the problem, the rest of the XML is parsed as well. Could 
anyone help me solve this so that I can parse the whole XML regardless of 
which Korean characters it contains?
Please note that I am not getting any exception here, but the parsing stops.


The code looks like this:

// Load the whole XML file into memory. The stream is closed in a finally
// block so it is released even if a read fails.
File data = new File("E://Folder1..//trial1.xml");
int fileSize = (int) data.length();
FileInputStream file = new FileInputStream(data);
byte[] data2 = new byte[fileSize];
try {
    int filled = 0;
    while (filled < fileSize) {
        // read() may return fewer bytes than requested, so keep going
        // until the buffer is full.
        int count = file.read(data2, filled, fileSize - filled);
        if (count < 0) {
            // Fail loudly instead of silently storing (byte) -1 into the
            // buffer when the file is shorter than File.length() reported.
            throw new IOException("Unexpected end of file after " + filled
                    + " of " + fileSize + " bytes: " + data);
        }
        filled += count;
    }
} finally {
    file.close();
}

// Parse the raw bytes; no character encoding is supplied here, so the
// parser works from the bytes alone.
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
DocumentBuilder db = dbf.newDocumentBuilder();
doc = db.parse(new InputSource(new ByteArrayInputStream(data2)));

// Serialize the parsed document element back to bytes and print it.
System.out.println("Reconverting");
byte[] removeResult = document2bytes(doc.getDocumentElement());
// NOTE(review): this decodes with the platform default charset, and
// encodingString() re-encodes with the platform default before decoding as
// UTF-8. That round trip is only lossless when the default charset can
// represent every byte value; on charsets with undefined bytes it may be
// what corrupts "some" Korean characters. Fixing it means changing this
// line and encodingString() together -- confirm before touching either.
String result = new String(removeResult);
System.out.println("Result =" + result);

System.out.println(encodingString("utf-8", "iso-8859-1", result));


/**
 * Serializes a DOM node to bytes using a default (identity) Transformer.
 *
 * No output properties are set, so the byte encoding is whatever the
 * Transformer implementation chooses by default.
 * NOTE(review): callers that decode the result need to know that encoding --
 * presumably UTF-8, confirm against the JAXP implementation in use.
 *
 * @param node the DOM node to serialize
 * @return the serialized bytes, or {@code null} if the transformer could not
 *         be created or the transformation failed (the stack trace is
 *         printed in that case)
 */
public static byte[] document2bytes(Node node) {
    try {
        Source source = new DOMSource(node);
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        Result result = new StreamResult(out);
        TransformerFactory factory = TransformerFactory.newInstance();
        Transformer transformer = factory.newTransformer();
        transformer.transform(source, result);
        return out.toByteArray();
    } catch (TransformerException e) {
        // Also covers TransformerConfigurationException, which is a subclass
        // and was handled identically.
        e.printStackTrace();
    }
    return null;
}

public static String encodingString(String fromEnc, String toEnc, String value)
                        throws IOException {
                        if (value != null) {
                                if ("iso-8859-1".equals(toEnc)) {
                                        System.out.println("[encodeString] 
value from static table cell element " + value);
                                        value = new String(value.getBytes
(), "UTF-8");
                                        System.out.println("[encodeString] 
Before encoding " + value);
                                        value = escapingNCR(value, false);
                                        System.out.println(" [encodeString] 
After encoding NCR " + value);
                                }
                                else {
                                        System.out.println("[encodeString] 
Before encoding " + value);
                                        ByteArrayInputStream bis = new 
ByteArrayInputStream(value.getBytes());
                                        ByteArrayOutputStream bos = new 
ByteArrayOutputStream();
                                        // Set up character stream
                                        Reader r = new BufferedReader(new 
InputStreamReader(bis, fromEnc));
                                        Writer w = new BufferedWriter(new 
OutputStreamWriter(bos, toEnc));
                                
                                        char[] buffer = new char[4096];
                                        int len;
                                        while ((len = r.read(buffer)) != -1)
                                                w.write(buffer, 0, len);
                                        r.close();
                                        w.flush();
                                        w.close();
                                        value = bos.toString();
                                        System.out.println("[encodeString] 
After encoding " + value);
                                }
                        }
                        return value;
                }


/**
 * Replaces characters with hexadecimal numeric character references
 * ({@code &#xHHHH;}).
 *
 * A character is copied through unchanged when it is in the range
 * U+0020..U+007E and {@code escapeAscii} is {@code false}, or when it occurs
 * in {@code specialSaveChars}. Every other character is replaced by a
 * reference whose hex digits are upper-cased; two-digit values are prefixed
 * with {@code "00"}, other lengths are emitted as-is.
 *
 * A valid surrogate pair is emitted as one reference to the combined code
 * point (for example {@code &#x1F600;}) instead of two references to the
 * individual surrogates, which are not legal XML characters.
 * NOTE(review): pairs are escaped without consulting
 * {@code specialSaveChars}; this assumes that constant holds no surrogate
 * characters -- confirm.
 *
 * @param str         text to escape; must not be {@code null}
 * @param escapeAscii if {@code true}, characters in U+0020..U+007E are
 *                    escaped too (unless listed in {@code specialSaveChars})
 * @return the escaped text
 */
public static String escapingNCR(String str, boolean escapeAscii) {
    int length = str.length();
    // StringBuilder avoids the quadratic cost of += on String in a loop.
    StringBuilder out = new StringBuilder(length);

    for (int i = 0; i < length; i++) {
        char ch = str.charAt(i);

        // Combine a high/low surrogate pair into a single reference.
        if (Character.isHighSurrogate(ch) && i + 1 < length
                && Character.isLowSurrogate(str.charAt(i + 1))) {
            int codePoint = Character.toCodePoint(ch, str.charAt(i + 1));
            out.append("&#x");
            out.append(Integer.toHexString(codePoint).toUpperCase(Locale.ENGLISH));
            out.append(';');
            i++; // the low surrogate has been consumed as well
            continue;
        }

        boolean printableAscii = (ch >= 0x0020) && (ch <= 0x007e);
        if ((!escapeAscii && printableAscii) || specialSaveChars.indexOf(ch) >= 0) {
            out.append(ch);
        } else {
            String hex = Integer.toHexString(ch & 0xFFFF);
            out.append("&#x");
            if (hex.length() == 2) {
                out.append("00");
            }
            out.append(hex.toUpperCase(Locale.ENGLISH));
            out.append(';');
        }
    }

    return out.toString();
}


The xml 'trial1.xml' that I parse could look like this:

---------------------------------------------------
<?xml version="1.0" encoding="euc-kr"?>
<TrialXML>프랭클린 어그레시브 그로스 </TrialXML>
---------------------------------------------------





---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Problem in Parsing xml with Korean Characters

Reply via email to