This is an automated email from the ASF dual-hosted git repository.
rcordier pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/james-mime4j.git
The following commit(s) were added to refs/heads/master by this push:
new bdb2264a Fixes parsing headers containing UTF-8 characters (#103)
bdb2264a is described below
commit bdb2264abd17badf70e27f7e32907bccc82d005e
Author: Thomas <[email protected]>
AuthorDate: Fri Apr 26 04:51:34 2024 +0200
Fixes parsing headers containing UTF-8 characters (#103)
---
.../apache/james/mime4j/stream/RawFieldParser.java | 46 +++++++---------------
.../org/apache/james/mime4j/util/CharsetUtil.java | 21 ++++++++++
.../org/apache/james/mime4j/util/ContentUtil.java | 7 +++-
.../james/mime4j/stream/RawFieldParserTest.java | 28 +++++++++++++
.../field/LenientContentDispositionFieldTest.java | 23 +++--------
5 files changed, 75 insertions(+), 50 deletions(-)
diff --git a/core/src/main/java/org/apache/james/mime4j/stream/RawFieldParser.java b/core/src/main/java/org/apache/james/mime4j/stream/RawFieldParser.java
index b546b302..8437927f 100644
--- a/core/src/main/java/org/apache/james/mime4j/stream/RawFieldParser.java
+++ b/core/src/main/java/org/apache/james/mime4j/stream/RawFieldParser.java
@@ -195,12 +195,6 @@ public class RawFieldParser {
* is not delimited by any character.
*/
public String parseValue(final ByteSequence buf, final ParserCursor cursor, final BitSet delimiters) {
- if (!CharsetUtil.isASCII(buf)) {
- String value = parseUtf8Filename(buf);
- if (value != null)
- return value;
- }
-
StringBuilder dst = new StringBuilder();
boolean whitespace = false;
while (!cursor.atEnd()) {
@@ -229,25 +223,6 @@ public class RawFieldParser {
return dst.toString();
}
- /**
- * Special case for parsing {@code filename} attribute in nonstandard
encoding like:
- * {@code Content-Disposition: attachment; filename="УПД ОБЩЕСТВО С
ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ "СТАНЦИЯ ВИРТУАЛЬНАЯ" 01-05-21.pdf"}
- *
- * @param buf field raw.
- * @return filename value or {@code null}.
- */
- private String parseUtf8Filename(ByteSequence buf) {
- final String value = new String(buf.toByteArray(),
StandardCharsets.UTF_8);
-
- final String prefix = "filename=\"";
- final int pos = value.indexOf(prefix);
- if (pos > 0) {
- return value.substring(pos + prefix.length(), value.length() - 1);
- }
-
- return null;
- }
-
/**
* Skips semantically insignificant whitespace characters and moves the
cursor to the closest
* non-whitespace character.
@@ -379,16 +354,22 @@ public class RawFieldParser {
int pos = cursor.getPos();
int indexFrom = cursor.getPos();
int indexTo = cursor.getUpperBound();
+
+ ByteArrayBuffer dstRaw = new ByteArrayBuffer(indexTo - indexFrom);
+
for (int i = indexFrom; i < indexTo; i++) {
- char current = (char) (buf.byteAt(i) & 0xff);
+ byte currentByte = buf.byteAt(i);
+ char current = (char) (currentByte & 0xff);
if ((delimiters != null && delimiters.get(current))
|| CharsetUtil.isWhitespace(current) || current == '(' ||
current == '\"') {
break;
} else {
pos++;
- dst.append(current);
+ dstRaw.append(currentByte);
}
}
+ String decoded = CharsetUtil.isASCII(dstRaw) ? ContentUtil.decode(dstRaw) : ContentUtil.decode(StandardCharsets.UTF_8, dstRaw);
+ dst.append(decoded);
cursor.updatePos(pos);
}
@@ -414,16 +395,17 @@ public class RawFieldParser {
pos++;
indexFrom++;
- ByteArrayBuffer dstRaw = new ByteArrayBuffer(200);
+ ByteArrayBuffer dstRaw = new ByteArrayBuffer(indexTo - indexFrom);
boolean escaped = false;
for (int i = indexFrom; i < indexTo; i++, pos++) {
- current = (char) (buf.byteAt(i) & 0xff);
+ byte currentByte = buf.byteAt(i);
+ current = (char) (currentByte & 0xff);
if (escaped) {
if (current != '\"' && current != '\\') {
dstRaw.append('\\');
}
- dstRaw.append(current);
+ dstRaw.append(currentByte);
escaped = false;
} else {
if (current == '\"') {
@@ -433,12 +415,12 @@ public class RawFieldParser {
if (current == '\\') {
escaped = true;
} else if (current != '\r' && current != '\n') {
- dstRaw.append(current);
+ dstRaw.append(currentByte);
}
}
}
- String decoded = ContentUtil.decode(dstRaw);
+ String decoded = CharsetUtil.isASCII(dstRaw) ? ContentUtil.decode(dstRaw) : ContentUtil.decode(StandardCharsets.UTF_8, dstRaw);
if (decoded.startsWith("=?")) {
decoded = DecoderUtil.decodeEncodedWords(decoded,
DecodeMonitor.SILENT);
}
diff --git a/core/src/main/java/org/apache/james/mime4j/util/CharsetUtil.java b/core/src/main/java/org/apache/james/mime4j/util/CharsetUtil.java
index 0a9c983c..4503cbdf 100644
--- a/core/src/main/java/org/apache/james/mime4j/util/CharsetUtil.java
+++ b/core/src/main/java/org/apache/james/mime4j/util/CharsetUtil.java
@@ -95,6 +95,27 @@ public class CharsetUtil {
return true;
}
+ /**
+ * Returns <code>true</code> if the specified string consists entirely of
+ * US ASCII characters.
+ *
+ * @param s
+ * string to test.
+ * @return <code>true</code> if the specified string consists entirely of
+ * US ASCII characters, <code>false</code> otherwise.
+ */
+ public static boolean isASCII(final CharSequence s) {
+ if (s == null) {
+ throw new IllegalArgumentException("String may not be null");
+ }
+ final int len = s.length();
+ for (int i = 0; i < len; i++) {
+ if (!isASCII(s.charAt(i))) {
+ return false;
+ }
+ }
+ return true;
+ }
/**
* Returns <code>true</code> if the specified character is a whitespace
* character (CR, LF, SP or HT).
diff --git a/core/src/main/java/org/apache/james/mime4j/util/ContentUtil.java b/core/src/main/java/org/apache/james/mime4j/util/ContentUtil.java
index 1e078120..b5c365dc 100644
--- a/core/src/main/java/org/apache/james/mime4j/util/ContentUtil.java
+++ b/core/src/main/java/org/apache/james/mime4j/util/ContentUtil.java
@@ -30,6 +30,7 @@ import java.lang.ref.SoftReference;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
import org.apache.james.mime4j.Charsets;
@@ -131,7 +132,8 @@ public class ContentUtil {
/**
* Encodes the specified string into an immutable sequence of bytes using
- * the US-ASCII charset.
+ * the US-ASCII charset, or UTF-8 if the sequence contains non-ASCII
+ * characters.
*
* @param string
* string to encode.
@@ -141,6 +143,9 @@ public class ContentUtil {
if (string == null) {
return null;
}
+ if (!CharsetUtil.isASCII(string)) {
+ return encode(StandardCharsets.UTF_8, string);
+ }
ByteArrayBuffer buf = new ByteArrayBuffer(string.length());
for (int i = 0; i < string.length(); i++) {
buf.append((byte) string.charAt(i));
diff --git a/core/src/test/java/org/apache/james/mime4j/stream/RawFieldParserTest.java b/core/src/test/java/org/apache/james/mime4j/stream/RawFieldParserTest.java
index dff799e3..ca41198e 100644
--- a/core/src/test/java/org/apache/james/mime4j/stream/RawFieldParserTest.java
+++ b/core/src/test/java/org/apache/james/mime4j/stream/RawFieldParserTest.java
@@ -461,4 +461,32 @@ public class RawFieldParserTest {
org.junit.Assert.assertEquals("simple boundary",
params.get(0).getValue());
}
+ @Test
+ public void testRegressionForContentDispositionParsingASCIIonly() {
+ ByteSequence buf = ContentUtil.encode(
+ "name=\"filedata\"; filename=\"Sanity a.doc\"");
+ ParserCursor cursor = new ParserCursor(0, buf.length());
+ List<NameValuePair> params = parser.parseParameters(buf, cursor);
+
+ org.junit.Assert.assertEquals(2, params.size());
+ org.junit.Assert.assertEquals("name", params.get(0).getName());
+ org.junit.Assert.assertEquals("filedata", params.get(0).getValue());
+ org.junit.Assert.assertEquals("filename", params.get(1).getName());
+ org.junit.Assert.assertEquals("Sanity a.doc",
params.get(1).getValue());
+ }
+
+ @Test
+ public void testRegressionForContentDispositionParsingUTF8() {
+ ByteSequence buf = ContentUtil.encode("name=\"filedata\"; filename=\"Sanity ä.doc\"");
+ ParserCursor cursor = new ParserCursor(0, buf.length());
+ List<NameValuePair> params = parser.parseParameters(buf, cursor);
+
+ org.junit.Assert.assertEquals(2, params.size());
+ org.junit.Assert.assertEquals("name", params.get(0).getName());
+ org.junit.Assert.assertEquals("filedata", params.get(0).getValue());
+ org.junit.Assert.assertEquals("filename", params.get(1).getName());
+ org.junit.Assert.assertEquals("Sanity ä.doc",
params.get(1).getValue());
+ }
+
+
}
diff --git a/dom/src/test/java/org/apache/james/mime4j/field/LenientContentDispositionFieldTest.java b/dom/src/test/java/org/apache/james/mime4j/field/LenientContentDispositionFieldTest.java
index 5978f3b2..eb563e28 100644
--- a/dom/src/test/java/org/apache/james/mime4j/field/LenientContentDispositionFieldTest.java
+++ b/dom/src/test/java/org/apache/james/mime4j/field/LenientContentDispositionFieldTest.java
@@ -19,14 +19,12 @@
package org.apache.james.mime4j.field;
-import java.nio.charset.StandardCharsets;
import java.util.Date;
import org.apache.james.mime4j.MimeException;
import org.apache.james.mime4j.dom.field.ContentDispositionField;
import org.apache.james.mime4j.stream.RawField;
import org.apache.james.mime4j.stream.RawFieldParser;
-import org.apache.james.mime4j.util.ByteArrayBuffer;
import org.apache.james.mime4j.util.ByteSequence;
import org.apache.james.mime4j.util.ContentUtil;
import org.junit.Assert;
@@ -40,11 +38,6 @@ public class LenientContentDispositionFieldTest {
return ContentDispositionFieldLenientImpl.PARSER.parse(rawField, null);
}
- static ContentDispositionField parse(final byte[] raw) throws
MimeException {
- RawField rawField = RawFieldParser.DEFAULT.parseField(new
ByteArrayBuffer(raw, true));
- return ContentDispositionFieldLenientImpl.PARSER.parse(rawField, null);
- }
-
@Test
public void testDispositionTypeWithSemiColonNoParams() throws Exception {
ContentDispositionField f = parse("Content-Disposition: inline;");
@@ -120,10 +113,9 @@ public class LenientContentDispositionFieldTest {
@Test
public void testGetFilenameEncoded() throws Exception {
- byte[] data = ("Content-Disposition: attachment;\n" +
+ String data = "Content-Disposition: attachment;\n" +
"
FileName=\"=?WINDOWS-1251?Q?3244659=5F=C0=EA=F2_=E7=E0_=C8=FE=EB=FC_?=\n" +
- " =?WINDOWS-1251?Q?2020.pdf?=\"")
- .getBytes(StandardCharsets.UTF_8);
+ " =?WINDOWS-1251?Q?2020.pdf?=\"";
ContentDispositionField f = parse(data);
@@ -132,10 +124,8 @@ public class LenientContentDispositionFieldTest {
@Test
public void testGetFilenameUtf8() throws Exception {
- byte[] data =
- "Content-Disposition: attachment; filename=\"УПД ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ \"СТАНЦИЯ ВИРТУАЛЬНАЯ\" 01-05-21.pdf\""
- .getBytes(StandardCharsets.UTF_8);
-
+ String data =
+ "Content-Disposition: attachment; filename=\"УПД ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ \\\"СТАНЦИЯ ВИРТУАЛЬНАЯ\\\" 01-05-21.pdf\"";
ContentDispositionField f = parse(data);
Assert.assertEquals("UTF8 encoded filename", "УПД ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ \"СТАНЦИЯ ВИРТУАЛЬНАЯ\" 01-05-21.pdf", f.getFilename());
@@ -143,10 +133,9 @@ public class LenientContentDispositionFieldTest {
@Test
public void testGetFilenameMultipartUtf8() throws Exception {
- byte[] data = ("Content-Disposition: attachment;\n" +
+ String data = "Content-Disposition: attachment;\n" +
"
filename*0*=\"UTF-8''%D0%A0%D0%BE%D1%81%D1%82%D0%B5%D0%BB%D0%B5%D0%BA%D0%BE\";\n"
+
- " filename*1*=\"%D0%BC%2E%78%6C%73%78\"\n")
- .getBytes(StandardCharsets.UTF_8);
+ " filename*1*=\"%D0%BC%2E%78%6C%73%78\"\n";
ContentDispositionField f = parse(data);
Assert.assertEquals("Ростелеком.xlsx", f.getFilename());
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]