Repository: camel Updated Branches: refs/heads/master 14b61640a -> 309ca3b1f
CAMEL-7584: XML-Aware Tokenizer failing with utf-8 multibyte characters Project: http://git-wip-us.apache.org/repos/asf/camel/repo Commit: http://git-wip-us.apache.org/repos/asf/camel/commit/309ca3b1 Tree: http://git-wip-us.apache.org/repos/asf/camel/tree/309ca3b1 Diff: http://git-wip-us.apache.org/repos/asf/camel/diff/309ca3b1 Branch: refs/heads/master Commit: 309ca3b1fe362ddb49daedc596a114421f6cd497 Parents: 14b6164 Author: Akitoshi Yoshida <a...@apache.org> Authored: Mon Jul 7 23:58:47 2014 +0200 Committer: Akitoshi Yoshida <a...@apache.org> Committed: Mon Jul 7 23:59:06 2014 +0200 ---------------------------------------------------------------------- .../apache/camel/support/RecordableReader.java | 94 +++++++++++++++ .../support/XMLTokenExpressionIterator.java | 39 +++++-- .../XMLTokenExpressionIteratorCharsetTest.java | 116 +++++++++++++++++++ 3 files changed, 242 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/camel/blob/309ca3b1/camel-core/src/main/java/org/apache/camel/support/RecordableReader.java ---------------------------------------------------------------------- diff --git a/camel-core/src/main/java/org/apache/camel/support/RecordableReader.java b/camel-core/src/main/java/org/apache/camel/support/RecordableReader.java new file mode 100644 index 0000000..776993b --- /dev/null +++ b/camel-core/src/main/java/org/apache/camel/support/RecordableReader.java @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
/**
 * A {@link FilterReader} that keeps a copy of every character read through it
 * while recording is enabled, so that the tokenizer can later extract the raw
 * text that was consumed from the underlying stream.
 * <p>
 * Recording is enabled on construction, is switched off by
 * {@link #getText(int)} and {@link #getChars(int)}, and is switched back on by
 * {@link #record()}.
 * REVISIT it is used package internally but may be moved to some common package.
 */
class RecordableReader extends FilterReader {
    private final TrimmableCharArrayWriter buf;
    private boolean recording;

    /**
     * Wraps the given reader and starts recording immediately.
     *
     * @param in the reader to read from and record
     */
    protected RecordableReader(Reader in) {
        super(in);
        this.buf = new TrimmableCharArrayWriter();
        this.recording = true;
    }

    /**
     * Reads a single character and records it when recording is enabled.
     *
     * @return the character read, or -1 at the end of the stream
     * @throws IOException if the underlying reader fails
     */
    @Override
    public int read() throws IOException {
        int c = super.read();
        // Only -1 signals end of stream; 0 is the valid character U+0000 and must be
        // recorded too, otherwise the buffer falls out of step with the bulk read below.
        if (c >= 0 && recording) {
            buf.write(c);
        }
        return c;
    }

    /**
     * Reads characters into the given array and records them when recording is enabled.
     *
     * @param cbuf destination array
     * @param off  offset in {@code cbuf} at which to start storing characters
     * @param len  maximum number of characters to read
     * @return the number of characters read, or -1 at the end of the stream
     * @throws IOException if the underlying reader fails
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        int n = super.read(cbuf, off, len);
        if (n > 0 && recording) {
            buf.write(cbuf, off, n);
        }
        return n;
    }

    /**
     * Stops recording, returns the first {@code pos} recorded characters as a string
     * and removes them from the record buffer.
     *
     * @param pos the number of leading characters to extract
     * @return the extracted text
     */
    public String getText(int pos) {
        recording = false;
        String t = new String(buf.getCharArray(), 0, pos);
        buf.trim(pos, 0);
        return t;
    }

    /**
     * Stops recording, returns a copy of the first {@code pos} recorded characters
     * and removes them from the record buffer.
     *
     * @param pos the number of leading characters to extract
     * @return a new array holding the extracted characters
     */
    public char[] getChars(int pos) {
        recording = false;
        char[] b = buf.toCharArray(pos);
        buf.trim(pos, 0);
        return b;
    }

    /**
     * Resumes recording of the characters read through this reader.
     */
    public void record() {
        recording = true;
    }

    /**
     * @return the number of characters currently held in the record buffer
     */
    int size() {
        return buf.size();
    }

    /**
     * A {@link CharArrayWriter} whose content can be cut at the head and tail
     * and whose backing array can be accessed without copying.
     */
    private static class TrimmableCharArrayWriter extends CharArrayWriter {
        /**
         * Drops {@code head} characters from the start and {@code tail} characters
         * from the end of the buffered content.
         */
        public void trim(int head, int tail) {
            System.arraycopy(buf, head, buf, 0, count - head - tail);
            count -= head + tail;
        }

        /**
         * Returns a copy of the first {@code len} characters of the backing array.
         */
        public char[] toCharArray(int len) {
            char[] b = new char[len];
            System.arraycopy(buf, 0, b, 0, len);
            return b;
        }

        /**
         * Returns the backing array itself (not a copy); only the first
         * {@code size()} characters are valid content.
         */
        char[] getCharArray() {
            return buf;
        }
    }

}
mode.charAt(0) : 0; } - protected Iterator<?> createIterator(InputStream in, Exchange exchange) throws XMLStreamException { - XMLTokenIterator iterator = new XMLTokenIterator(path, nsmap, mode, in, exchange); + protected Iterator<?> createIterator(InputStream in, Exchange exchange) throws XMLStreamException, UnsupportedEncodingException { + String charset = IOHelper.getCharsetName(exchange, false); + Reader reader; + if (charset == null) { + reader = new InputStreamReader(in); + } else { + reader = new InputStreamReader(in, charset); + } + XMLTokenIterator iterator = new XMLTokenIterator(path, nsmap, mode, reader); + return iterator; + } + + protected Iterator<?> createIterator(Reader in) throws XMLStreamException { + XMLTokenIterator iterator = new XMLTokenIterator(path, nsmap, mode, in); return iterator; } @@ -112,6 +127,11 @@ public class XMLTokenExpressionIterator extends ExpressionAdapter implements Nam // must close input stream IOHelper.close(in); return null; + } catch (UnsupportedEncodingException e) { + exchange.setException(e); + // must close input stream + IOHelper.close(in); + return null; } finally { if (closeStream) { IOHelper.close(in); @@ -127,7 +147,7 @@ public class XMLTokenExpressionIterator extends ExpressionAdapter implements Nam private AttributedQName[] splitpath; private int index; private char mode; - private RecordableInputStream in; + private RecordableReader in; private XMLStreamReader reader; private List<QName> path; private List<Map<String, String>> namespaces; @@ -141,7 +161,13 @@ public class XMLTokenExpressionIterator extends ExpressionAdapter implements Nam private Object nextToken; - public XMLTokenIterator(String path, Map<String, String> nsmap, char mode, InputStream in, Exchange exchange) throws XMLStreamException { + public XMLTokenIterator(String path, Map<String, String> nsmap, char mode, InputStream in, String charset) + throws XMLStreamException, UnsupportedEncodingException { + // woodstox's 
getLocation().etCharOffset() does not return the offset correctly for InputStream, so use Reader instead. + this(path, nsmap, mode, new InputStreamReader(in, charset)); + } + + public XMLTokenIterator(String path, Map<String, String> nsmap, char mode, Reader in) throws XMLStreamException { final String[] sl = path.substring(1).split("/"); this.splitpath = new AttributedQName[sl.length]; for (int i = 0; i < sl.length; i++) { @@ -156,9 +182,8 @@ public class XMLTokenExpressionIterator extends ExpressionAdapter implements Nam } this.mode = mode != 0 ? mode : 'i'; - String charset = IOHelper.getCharsetName(exchange, false); - this.in = new RecordableInputStream(in, charset); - this.reader = new StaxConverter().createXMLStreamReader(this.in, exchange); + this.in = new RecordableReader(in); + this.reader = new StaxConverter().createXMLStreamReader(this.in); LOG.trace("reader.class: {}", reader.getClass()); int coff = reader.getLocation().getCharacterOffset(); http://git-wip-us.apache.org/repos/asf/camel/blob/309ca3b1/camel-core/src/test/java/org/apache/camel/support/XMLTokenExpressionIteratorCharsetTest.java ---------------------------------------------------------------------- diff --git a/camel-core/src/test/java/org/apache/camel/support/XMLTokenExpressionIteratorCharsetTest.java b/camel-core/src/test/java/org/apache/camel/support/XMLTokenExpressionIteratorCharsetTest.java new file mode 100644 index 0000000..1d146b6 --- /dev/null +++ b/camel-core/src/test/java/org/apache/camel/support/XMLTokenExpressionIteratorCharsetTest.java @@ -0,0 +1,116 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.camel.support; + +import java.io.ByteArrayInputStream; +import java.io.Closeable; +import java.io.IOException; +import java.io.StringReader; +import java.io.UnsupportedEncodingException; +import java.text.MessageFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import javax.xml.stream.XMLStreamException; + +import junit.framework.TestCase; + +import org.apache.camel.Exchange; +import org.apache.camel.builder.ExchangeBuilder; +import org.apache.camel.impl.DefaultCamelContext; + + +/** + * + */ +public class XMLTokenExpressionIteratorCharsetTest extends TestCase { + private static final String DATA_TEMPLATE = + "<?xml version=\"1.0\" encoding=\"{0}\"?>" + + "<Statements xmlns=\"http://www.apache.org/xml/test\">" + + " <statement>we l\u00f3ve iso-latin</statement>" + + " <statement>we h\u00e4te unicode</statement>" + + "</Statements>"; + + private static final String[] RESULTS = { + "<statement xmlns=\"http://www.apache.org/xml/test\">we l\u00f3ve iso-latin</statement>", + "<statement xmlns=\"http://www.apache.org/xml/test\">we h\u00e4te unicode</statement>" + }; + + private static final String DATA_STRING = MessageFormat.format(DATA_TEMPLATE, "utf-8"); + private static final byte[] DATA_UTF8 = getBytes(DATA_TEMPLATE, "utf-8"); + private static final byte[] DATA_ISOLATIN = getBytes(DATA_TEMPLATE, "iso-8859-1"); + + private static final Map<String, String> NSMAP = Collections.singletonMap("", "http://www.apache.org/xml/test"); + + 
private Exchange exchange; + + private static byte[] getBytes(String template, String charset) { + try { + return MessageFormat.format(template, charset).getBytes(charset); + } catch (UnsupportedEncodingException e) { + //ignore + } + return null; + } + + @Override + protected void setUp() throws Exception { + exchange = ExchangeBuilder.anExchange(new DefaultCamelContext()).build(); + } + + public void testTokenzeWithUTF8() throws Exception { + XMLTokenExpressionIterator xtei = new XMLTokenExpressionIterator("//statement", 'i'); + xtei.setNamespaces(NSMAP); + + exchange.getIn().setHeader(Exchange.CHARSET_NAME, "utf-8"); + invokeAndVerify(xtei.createIterator(new ByteArrayInputStream(DATA_UTF8), exchange)); + } + + public void testTokenizeWithISOLatin() throws Exception { + XMLTokenExpressionIterator xtei = new XMLTokenExpressionIterator("//statement", 'i'); + xtei.setNamespaces(NSMAP); + + exchange.getIn().setHeader(Exchange.CHARSET_NAME, "iso-8859-1"); + invokeAndVerify(xtei.createIterator(new ByteArrayInputStream(DATA_ISOLATIN), exchange)); + } + + public void testTokenizeWithReader() throws Exception { + XMLTokenExpressionIterator xtei = new XMLTokenExpressionIterator("//statement", 'i'); + xtei.setNamespaces(NSMAP); + + invokeAndVerify(xtei.createIterator(new StringReader(DATA_STRING))); + } + + private void invokeAndVerify(Iterator<?> tokenizer) throws IOException, XMLStreamException { + List<String> results = new ArrayList<String>(); + while (tokenizer.hasNext()) { + String token = (String)tokenizer.next(); + results.add(token); + } + ((Closeable)tokenizer).close(); + + assertEquals("token count", RESULTS.length, results.size()); + for (int i = 0; i < RESULTS.length; i++) { + assertEquals("mismatch [" + i + "]", RESULTS[i], results.get(i)); + } + + } + +}