git commit: CAMEL-7584: XML-Aware Tokenizer failing with utf-8 multibyte characters

ay Mon, 07 Jul 2014 14:59:49 -0700

Repository: camel
Updated Branches:
  refs/heads/master 14b61640a -> 309ca3b1f



CAMEL-7584: XML-Aware Tokenizer failing with utf-8 multibyte characters


Project: http://git-wip-us.apache.org/repos/asf/camel/repo
Commit: http://git-wip-us.apache.org/repos/asf/camel/commit/309ca3b1
Tree: http://git-wip-us.apache.org/repos/asf/camel/tree/309ca3b1
Diff: http://git-wip-us.apache.org/repos/asf/camel/diff/309ca3b1

Branch: refs/heads/master
Commit: 309ca3b1fe362ddb49daedc596a114421f6cd497
Parents: 14b6164
Author: Akitoshi Yoshida <a...@apache.org>
Authored: Mon Jul 7 23:58:47 2014 +0200
Committer: Akitoshi Yoshida <a...@apache.org>
Committed: Mon Jul 7 23:59:06 2014 +0200

----------------------------------------------------------------------
 .../apache/camel/support/RecordableReader.java  |  94 +++++++++++++++
 .../support/XMLTokenExpressionIterator.java     |  39 +++++--
 .../XMLTokenExpressionIteratorCharsetTest.java  | 116 +++++++++++++++++++
 3 files changed, 242 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/camel/blob/309ca3b1/camel-core/src/main/java/org/apache/camel/support/RecordableReader.java
----------------------------------------------------------------------
diff --git a/camel-core/src/main/java/org/apache/camel/support/RecordableReader.java b/camel-core/src/main/java/org/apache/camel/support/RecordableReader.java
new file mode 100644
index 0000000..776993b
--- /dev/null
+++ b/camel-core/src/main/java/org/apache/camel/support/RecordableReader.java
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.camel.support;
+
+import java.io.CharArrayWriter;
+import java.io.FilterReader;
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * This class is used by the toknizer to extract data while reading from the stream.
+ * REVIST it is used package internally but may be moved to some common package.
+ */
+class RecordableReader extends FilterReader {
+    private TrimmableCharArrayWriter buf;
+    private boolean recording;
+    protected RecordableReader(Reader in) {
+        super(in);
+        this.buf = new TrimmableCharArrayWriter();
+        this.recording = true;
+    }
+
+    @Override
+    public int read() throws IOException {
+        int c = super.read();
+        if (c > 0 && recording) {
+            buf.write(c);
+        }
+        return c;
+    }
+
+    @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+        int n = super.read(cbuf, off, len);
+        if (n > 0 && recording) {
+            buf.write(cbuf, off, n);
+        }
+        return n;
+    }
+
+    public String getText(int pos) {
+        recording = false;
+        String t = new String(buf.getCharArray(), 0, pos);
+        buf.trim(pos, 0);
+        return t;
+    }
+    
+    public char[] getChars(int pos) {
+        recording = false;
+        char[] b = buf.toCharArray(pos);
+        buf.trim(pos, 0);
+        return b;
+    }
+    
+    public void record() {
+        recording = true;
+    }
+
+    int size() {
+        return buf.size();
+    }
+
+    private static class TrimmableCharArrayWriter extends CharArrayWriter {
+        public void trim(int head, int tail) {
+            System.arraycopy(buf, head, buf, 0, count - head - tail);
+            count -= head + tail;
+        }
+        
+        public char[] toCharArray(int len) {
+            char[] b = new char[len];
+            System.arraycopy(buf, 0, b, 0, len);
+            return b;
+        }
+
+        char[] getCharArray() {
+            return buf;
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/camel/blob/309ca3b1/camel-core/src/main/java/org/apache/camel/support/XMLTokenExpressionIterator.java
----------------------------------------------------------------------
diff --git a/camel-core/src/main/java/org/apache/camel/support/XMLTokenExpressionIterator.java b/camel-core/src/main/java/org/apache/camel/support/XMLTokenExpressionIterator.java
index a9ca796..e0724f4 100644
--- a/camel-core/src/main/java/org/apache/camel/support/XMLTokenExpressionIterator.java
+++ b/camel-core/src/main/java/org/apache/camel/support/XMLTokenExpressionIterator.java
@@ -19,6 +19,9 @@ package org.apache.camel.support;
 import java.io.Closeable;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.UnsupportedEncodingException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -71,8 +74,20 @@ public class XMLTokenExpressionIterator extends ExpressionAdapter implements Nam
         this.mode = mode != null ? mode.charAt(0) : 0;
     }
     
-    protected Iterator<?> createIterator(InputStream in, Exchange exchange) throws XMLStreamException {
-        XMLTokenIterator iterator = new XMLTokenIterator(path, nsmap, mode, in, exchange);
+    protected Iterator<?> createIterator(InputStream in, Exchange exchange) throws XMLStreamException, UnsupportedEncodingException {
+        String charset = IOHelper.getCharsetName(exchange, false);
+        Reader reader;
+        if (charset == null) {
+            reader = new InputStreamReader(in);
+        } else {
+            reader = new InputStreamReader(in, charset);
+        }
+        XMLTokenIterator iterator = new XMLTokenIterator(path, nsmap, mode, reader);
+        return iterator;
+    }
+
+    protected Iterator<?> createIterator(Reader in) throws XMLStreamException {
+        XMLTokenIterator iterator = new XMLTokenIterator(path, nsmap, mode, in);
         return iterator;
     }
 
@@ -112,6 +127,11 @@ public class XMLTokenExpressionIterator extends ExpressionAdapter implements Nam
             // must close input stream
             IOHelper.close(in);
             return null;
+        } catch (UnsupportedEncodingException e) {
+            exchange.setException(e);
+            // must close input stream
+            IOHelper.close(in);
+            return null;
         } finally {
             if (closeStream) {
                 IOHelper.close(in);
@@ -127,7 +147,7 @@ public class XMLTokenExpressionIterator extends ExpressionAdapter implements Nam
         private AttributedQName[] splitpath;
         private int index;
         private char mode;
-        private RecordableInputStream in;
+        private RecordableReader in;
         private XMLStreamReader reader;
         private List<QName> path;
         private List<Map<String, String>> namespaces;
@@ -141,7 +161,13 @@ public class XMLTokenExpressionIterator extends ExpressionAdapter implements Nam
         
         private Object nextToken;
         
-        public XMLTokenIterator(String path, Map<String, String> nsmap, char mode, InputStream in, Exchange exchange) throws XMLStreamException {
+        public XMLTokenIterator(String path, Map<String, String> nsmap, char mode, InputStream in, String charset) 
+            throws XMLStreamException, UnsupportedEncodingException {
+            // woodstox's getLocation().etCharOffset() does not return the offset correctly for InputStream, so use Reader instead.
+            this(path, nsmap, mode, new InputStreamReader(in, charset));
+        }
+
+        public XMLTokenIterator(String path, Map<String, String> nsmap, char mode, Reader in) throws XMLStreamException {
             final String[] sl = path.substring(1).split("/");
             this.splitpath = new AttributedQName[sl.length];
             for (int i = 0; i < sl.length; i++) {
@@ -156,9 +182,8 @@ public class XMLTokenExpressionIterator extends ExpressionAdapter implements Nam
             }
             
             this.mode = mode != 0 ? mode : 'i';
-            String charset = IOHelper.getCharsetName(exchange, false);
-            this.in = new RecordableInputStream(in, charset);
-            this.reader = new StaxConverter().createXMLStreamReader(this.in, exchange);
+            this.in = new RecordableReader(in);
+            this.reader = new StaxConverter().createXMLStreamReader(this.in);
 
             LOG.trace("reader.class: {}", reader.getClass());
             int coff = reader.getLocation().getCharacterOffset();

http://git-wip-us.apache.org/repos/asf/camel/blob/309ca3b1/camel-core/src/test/java/org/apache/camel/support/XMLTokenExpressionIteratorCharsetTest.java
----------------------------------------------------------------------
diff --git a/camel-core/src/test/java/org/apache/camel/support/XMLTokenExpressionIteratorCharsetTest.java b/camel-core/src/test/java/org/apache/camel/support/XMLTokenExpressionIteratorCharsetTest.java
new file mode 100644
index 0000000..1d146b6
--- /dev/null
+++ b/camel-core/src/test/java/org/apache/camel/support/XMLTokenExpressionIteratorCharsetTest.java
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.camel.support;
+
+import java.io.ByteArrayInputStream;
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.StringReader;
+import java.io.UnsupportedEncodingException;
+import java.text.MessageFormat;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import javax.xml.stream.XMLStreamException;
+
+import junit.framework.TestCase;
+
+import org.apache.camel.Exchange;
+import org.apache.camel.builder.ExchangeBuilder;
+import org.apache.camel.impl.DefaultCamelContext;
+
+
+/**
+ *
+ */
+public class XMLTokenExpressionIteratorCharsetTest extends TestCase {
+    private static final String DATA_TEMPLATE = 
+        "<?xml version=\"1.0\" encoding=\"{0}\"?>"
+        + "<Statements xmlns=\"http://www.apache.org/xml/test\">"
+        + "    <statement>we l\u00f3ve iso-latin</statement>"
+        + "    <statement>we h\u00e4te unicode</statement>"
+        + "</Statements>";
+
+    private static final String[] RESULTS = {
+        "<statement xmlns=\"http://www.apache.org/xml/test\">we l\u00f3ve iso-latin</statement>",
+        "<statement xmlns=\"http://www.apache.org/xml/test\">we h\u00e4te unicode</statement>"
+    };
+
+    private static final String DATA_STRING = MessageFormat.format(DATA_TEMPLATE, "utf-8");
+    private static final byte[] DATA_UTF8 = getBytes(DATA_TEMPLATE, "utf-8");
+    private static final byte[] DATA_ISOLATIN = getBytes(DATA_TEMPLATE, "iso-8859-1");
+
+    private static final Map<String, String> NSMAP = Collections.singletonMap("", "http://www.apache.org/xml/test");
+
+    private Exchange exchange;
+
+    private static byte[] getBytes(String template, String charset) {
+        try {
+            return MessageFormat.format(template, charset).getBytes(charset);
+        } catch (UnsupportedEncodingException e) {
+            //ignore
+        }
+        return null;
+    }
+    
+    @Override
+    protected void setUp() throws Exception {
+        exchange = ExchangeBuilder.anExchange(new DefaultCamelContext()).build();
+    }
+
+    public void testTokenzeWithUTF8() throws Exception {
+        XMLTokenExpressionIterator xtei = new XMLTokenExpressionIterator("//statement", 'i');
+        xtei.setNamespaces(NSMAP);
+
+        exchange.getIn().setHeader(Exchange.CHARSET_NAME, "utf-8");
+        invokeAndVerify(xtei.createIterator(new ByteArrayInputStream(DATA_UTF8), exchange));
+    }
+
+    public void testTokenizeWithISOLatin() throws Exception {
+        XMLTokenExpressionIterator xtei = new XMLTokenExpressionIterator("//statement", 'i');
+        xtei.setNamespaces(NSMAP);
+
+        exchange.getIn().setHeader(Exchange.CHARSET_NAME, "iso-8859-1");
+        invokeAndVerify(xtei.createIterator(new ByteArrayInputStream(DATA_ISOLATIN), exchange));
+    }
+
+    public void testTokenizeWithReader() throws Exception {
+        XMLTokenExpressionIterator xtei = new XMLTokenExpressionIterator("//statement", 'i');
+        xtei.setNamespaces(NSMAP);
+
+        invokeAndVerify(xtei.createIterator(new StringReader(DATA_STRING)));
+    }
+
+    private void invokeAndVerify(Iterator<?> tokenizer) throws IOException, XMLStreamException {
+        List<String> results = new ArrayList<String>();
+        while (tokenizer.hasNext()) {
+            String token = (String)tokenizer.next();
+            results.add(token);
+        }
+        ((Closeable)tokenizer).close();
+        
+        assertEquals("token count", RESULTS.length, results.size());
+        for (int i = 0; i < RESULTS.length; i++) {
+            assertEquals("mismatch [" + i + "]", RESULTS[i], results.get(i));
+        }
+        
+    }
+
+}

git commit: CAMEL-7584: XML-Aware Tokenizer failing with utf-8 multibyte characters

Reply via email to