This is an automated email from the ASF dual-hosted git repository. markt pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tomcat.git
The following commit(s) were added to refs/heads/main by this push: new a9ca10266b Remove the custom UTF-8 decoder. a9ca10266b is described below commit a9ca10266bd5cf2919f59b1916ee786c01e2a033 Author: Mark Thomas <ma...@apache.org> AuthorDate: Tue Apr 26 18:01:51 2022 +0100 Remove the custom UTF-8 decoder. The issues this was introduced to work around were fixed in early Java 8 releases. Now the minimum Java version is 11, we can be sure we don't need the custom decoder. Retain the test that checks the JVM provided decoder to catch any regressions. --- java/org/apache/tomcat/util/buf/B2CConverter.java | 10 +- java/org/apache/tomcat/util/buf/Utf8Decoder.java | 299 ---------------------- java/org/apache/tomcat/websocket/WsFrameBase.java | 6 +- test/org/apache/tomcat/util/buf/TestUtf8.java | 9 - webapps/docs/changelog.xml | 6 + 5 files changed, 10 insertions(+), 320 deletions(-) diff --git a/java/org/apache/tomcat/util/buf/B2CConverter.java b/java/org/apache/tomcat/util/buf/B2CConverter.java index 532c209ec9..c7fd4b67b7 100644 --- a/java/org/apache/tomcat/util/buf/B2CConverter.java +++ b/java/org/apache/tomcat/util/buf/B2CConverter.java @@ -24,7 +24,6 @@ import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CoderResult; import java.nio.charset.CodingErrorAction; -import java.nio.charset.StandardCharsets; import java.util.Locale; import org.apache.tomcat.util.res.StringManager; @@ -90,14 +89,7 @@ public class B2CConverter { } else { action = CodingErrorAction.REPORT; } - // Special case. 
Use the Apache Harmony based UTF-8 decoder because it - // - a) rejects invalid sequences that the JVM decoder does not - // - b) fails faster for some invalid sequences - if (charset.equals(StandardCharsets.UTF_8)) { - decoder = new Utf8Decoder(); - } else { - decoder = charset.newDecoder(); - } + decoder = charset.newDecoder(); decoder.onMalformedInput(action); decoder.onUnmappableCharacter(action); } diff --git a/java/org/apache/tomcat/util/buf/Utf8Decoder.java b/java/org/apache/tomcat/util/buf/Utf8Decoder.java deleted file mode 100644 index 932e88c764..0000000000 --- a/java/org/apache/tomcat/util/buf/Utf8Decoder.java +++ /dev/null @@ -1,299 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tomcat.util.buf; - -import java.nio.ByteBuffer; -import java.nio.CharBuffer; -import java.nio.charset.CharsetDecoder; -import java.nio.charset.CoderResult; -import java.nio.charset.StandardCharsets; - -/** - * Decodes bytes to UTF-8. Extracted from Apache Harmony and modified to reject - * code points from U+D800 to U+DFFF as per RFC3629. The standard Java decoder - * does not reject these. 
It has also been modified to reject code points - * greater than U+10FFFF which the standard Java decoder rejects but the harmony - * one does not. - */ -public class Utf8Decoder extends CharsetDecoder { - - // The next table contains information about UTF-8 charset and - // correspondence of 1st byte to the length of sequence - // For information please visit http://www.ietf.org/rfc/rfc3629.txt - // - // Please note, o means 0, actually. - // ------------------------------------------------------------------- - // 0 1 2 3 Value - // ------------------------------------------------------------------- - // oxxxxxxx 00000000 00000000 0xxxxxxx - // 11oyyyyy 1oxxxxxx 00000000 00000yyy yyxxxxxx - // 111ozzzz 1oyyyyyy 1oxxxxxx 00000000 zzzzyyyy yyxxxxxx - // 1111ouuu 1ouuzzzz 1oyyyyyy 1oxxxxxx 000uuuuu zzzzyyyy yyxxxxxx - private static final int remainingBytes[] = { - // 1owwwwww - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - // 11oyyyyy - -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - // 111ozzzz - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - // 1111ouuu - 3, 3, 3, 3, 3, -1, -1, -1, - // > 11110111 - -1, -1, -1, -1, -1, -1, -1, -1}; - private static final int remainingNumbers[] = {0, // 0 1 2 3 - 4224, // (01o00000b << 6)+(1o000000b) - 401536, // (011o0000b << 12)+(1o000000b << 6)+(1o000000b) - 29892736 // (0111o000b << 18)+(1o000000b << 12)+(1o000000b << - // 6)+(1o000000b) - }; - private static final int lowerEncodingLimit[] = {-1, 0x80, 0x800, 0x10000}; - - - public Utf8Decoder() { - super(StandardCharsets.UTF_8, 1.0f, 1.0f); - } - - - @Override - protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) { - if (in.hasArray() && out.hasArray()) { - return decodeHasArray(in, out); - } - 
return decodeNotHasArray(in, out); - } - - - private CoderResult decodeNotHasArray(ByteBuffer in, CharBuffer out) { - int outRemaining = out.remaining(); - int pos = in.position(); - int limit = in.limit(); - try { - while (pos < limit) { - if (outRemaining == 0) { - return CoderResult.OVERFLOW; - } - int jchar = in.get(); - if (jchar < 0) { - jchar = jchar & 0x7F; - int tail = remainingBytes[jchar]; - if (tail == -1) { - return CoderResult.malformedForLength(1); - } - if (limit - pos < 1 + tail) { - // No early test for invalid sequences here as peeking - // at the next byte is harder - return CoderResult.UNDERFLOW; - } - int nextByte; - for (int i = 0; i < tail; i++) { - nextByte = in.get() & 0xFF; - if ((nextByte & 0xC0) != 0x80) { - return CoderResult.malformedForLength(1 + i); - } - jchar = (jchar << 6) + nextByte; - } - jchar -= remainingNumbers[tail]; - if (jchar < lowerEncodingLimit[tail]) { - // Should have been encoded in a fewer octets - return CoderResult.malformedForLength(1); - } - pos += tail; - } - // Apache Tomcat added test - if (jchar >= 0xD800 && jchar <= 0xDFFF) { - return CoderResult.unmappableForLength(3); - } - // Apache Tomcat added test - if (jchar > 0x10FFFF) { - return CoderResult.unmappableForLength(4); - } - if (jchar <= 0xffff) { - out.put((char) jchar); - outRemaining--; - } else { - if (outRemaining < 2) { - return CoderResult.OVERFLOW; - } - out.put((char) ((jchar >> 0xA) + 0xD7C0)); - out.put((char) ((jchar & 0x3FF) + 0xDC00)); - outRemaining -= 2; - } - pos++; - } - return CoderResult.UNDERFLOW; - } finally { - in.position(pos); - } - } - - - private CoderResult decodeHasArray(ByteBuffer in, CharBuffer out) { - int outRemaining = out.remaining(); - int pos = in.position(); - int limit = in.limit(); - final byte[] bArr = in.array(); - final char[] cArr = out.array(); - final int inIndexLimit = limit + in.arrayOffset(); - int inIndex = pos + in.arrayOffset(); - int outIndex = out.position() + out.arrayOffset(); - // if someone 
would change the limit in process, - // they would face consequences - for (; inIndex < inIndexLimit && outRemaining > 0; inIndex++) { - int jchar = bArr[inIndex]; - if (jchar < 0) { - jchar = jchar & 0x7F; - // If first byte is invalid, tail will be set to -1 - int tail = remainingBytes[jchar]; - if (tail == -1) { - in.position(inIndex - in.arrayOffset()); - out.position(outIndex - out.arrayOffset()); - return CoderResult.malformedForLength(1); - } - // Additional checks to detect invalid sequences ASAP - // Checks derived from Unicode 6.2, Chapter 3, Table 3-7 - // Check 2nd byte - int tailAvailable = inIndexLimit - inIndex - 1; - if (tailAvailable > 0) { - // First byte C2..DF, second byte 80..BF - if (jchar > 0x41 && jchar < 0x60 && - (bArr[inIndex + 1] & 0xC0) != 0x80) { - in.position(inIndex - in.arrayOffset()); - out.position(outIndex - out.arrayOffset()); - return CoderResult.malformedForLength(1); - } - // First byte E0, second byte A0..BF - if (jchar == 0x60 && (bArr[inIndex + 1] & 0xE0) != 0xA0) { - in.position(inIndex - in.arrayOffset()); - out.position(outIndex - out.arrayOffset()); - return CoderResult.malformedForLength(1); - } - // First byte E1..EC, second byte 80..BF - if (jchar > 0x60 && jchar < 0x6D && - (bArr[inIndex + 1] & 0xC0) != 0x80) { - in.position(inIndex - in.arrayOffset()); - out.position(outIndex - out.arrayOffset()); - return CoderResult.malformedForLength(1); - } - // First byte ED, second byte 80..9F - if (jchar == 0x6D && (bArr[inIndex + 1] & 0xE0) != 0x80) { - in.position(inIndex - in.arrayOffset()); - out.position(outIndex - out.arrayOffset()); - return CoderResult.malformedForLength(1); - } - // First byte EE..EF, second byte 80..BF - if (jchar > 0x6D && jchar < 0x70 && - (bArr[inIndex + 1] & 0xC0) != 0x80) { - in.position(inIndex - in.arrayOffset()); - out.position(outIndex - out.arrayOffset()); - return CoderResult.malformedForLength(1); - } - // First byte F0, second byte 90..BF - if (jchar == 0x70 && - ((bArr[inIndex + 1] & 
0xFF) < 0x90 || - (bArr[inIndex + 1] & 0xFF) > 0xBF)) { - in.position(inIndex - in.arrayOffset()); - out.position(outIndex - out.arrayOffset()); - return CoderResult.malformedForLength(1); - } - // First byte F1..F3, second byte 80..BF - if (jchar > 0x70 && jchar < 0x74 && - (bArr[inIndex + 1] & 0xC0) != 0x80) { - in.position(inIndex - in.arrayOffset()); - out.position(outIndex - out.arrayOffset()); - return CoderResult.malformedForLength(1); - } - // First byte F4, second byte 80..8F - if (jchar == 0x74 && - (bArr[inIndex + 1] & 0xF0) != 0x80) { - in.position(inIndex - in.arrayOffset()); - out.position(outIndex - out.arrayOffset()); - return CoderResult.malformedForLength(1); - } - } - // Check third byte if present and expected - if (tailAvailable > 1 && tail > 1) { - if ((bArr[inIndex + 2] & 0xC0) != 0x80) { - in.position(inIndex - in.arrayOffset()); - out.position(outIndex - out.arrayOffset()); - return CoderResult.malformedForLength(2); - } - } - // Check fourth byte if present and expected - if (tailAvailable > 2 && tail > 2) { - if ((bArr[inIndex + 3] & 0xC0) != 0x80) { - in.position(inIndex - in.arrayOffset()); - out.position(outIndex - out.arrayOffset()); - return CoderResult.malformedForLength(3); - } - } - if (tailAvailable < tail) { - break; - } - for (int i = 0; i < tail; i++) { - int nextByte = bArr[inIndex + i + 1] & 0xFF; - if ((nextByte & 0xC0) != 0x80) { - in.position(inIndex - in.arrayOffset()); - out.position(outIndex - out.arrayOffset()); - return CoderResult.malformedForLength(1 + i); - } - jchar = (jchar << 6) + nextByte; - } - jchar -= remainingNumbers[tail]; - if (jchar < lowerEncodingLimit[tail]) { - // Should have been encoded in fewer octets - in.position(inIndex - in.arrayOffset()); - out.position(outIndex - out.arrayOffset()); - return CoderResult.malformedForLength(1); - } - inIndex += tail; - } - // Apache Tomcat added test - if (jchar >= 0xD800 && jchar <= 0xDFFF) { - return CoderResult.unmappableForLength(3); - } - // Apache Tomcat 
added test - if (jchar > 0x10FFFF) { - return CoderResult.unmappableForLength(4); - } - if (jchar <= 0xffff) { - cArr[outIndex++] = (char) jchar; - outRemaining--; - } else { - if (outRemaining < 2) { - // Encoded with 4 bytes. inIndex currently points - // to the final byte. Move it back to first byte. - inIndex -= 3; - in.position(inIndex - in.arrayOffset()); - out.position(outIndex - out.arrayOffset()); - return CoderResult.OVERFLOW; - } - cArr[outIndex++] = (char) ((jchar >> 0xA) + 0xD7C0); - cArr[outIndex++] = (char) ((jchar & 0x3FF) + 0xDC00); - outRemaining -= 2; - } - } - in.position(inIndex - in.arrayOffset()); - out.position(outIndex - out.arrayOffset()); - return (outRemaining == 0 && inIndex < inIndexLimit) ? - CoderResult.OVERFLOW : - CoderResult.UNDERFLOW; - } -} diff --git a/java/org/apache/tomcat/websocket/WsFrameBase.java b/java/org/apache/tomcat/websocket/WsFrameBase.java index fd8c4ebab0..10dfb7913d 100644 --- a/java/org/apache/tomcat/websocket/WsFrameBase.java +++ b/java/org/apache/tomcat/websocket/WsFrameBase.java @@ -22,6 +22,7 @@ import java.nio.CharBuffer; import java.nio.charset.CharsetDecoder; import java.nio.charset.CoderResult; import java.nio.charset.CodingErrorAction; +import java.nio.charset.StandardCharsets; import java.util.List; import java.util.concurrent.atomic.AtomicReferenceFieldUpdater; @@ -33,7 +34,6 @@ import jakarta.websocket.PongMessage; import org.apache.juli.logging.Log; import org.apache.tomcat.util.ExceptionUtils; -import org.apache.tomcat.util.buf.Utf8Decoder; import org.apache.tomcat.util.res.StringManager; /** @@ -57,10 +57,10 @@ public abstract class WsFrameBase { private final CharBuffer controlBufferText = CharBuffer.allocate(125); // Attributes of the current message - private final CharsetDecoder utf8DecoderControl = new Utf8Decoder(). + private final CharsetDecoder utf8DecoderControl = StandardCharsets.UTF_8.newDecoder(). onMalformedInput(CodingErrorAction.REPORT). 
onUnmappableCharacter(CodingErrorAction.REPORT); - private final CharsetDecoder utf8DecoderMessage = new Utf8Decoder(). + private final CharsetDecoder utf8DecoderMessage = StandardCharsets.UTF_8.newDecoder(). onMalformedInput(CodingErrorAction.REPORT). onUnmappableCharacter(CodingErrorAction.REPORT); private boolean continuationExpected = false; diff --git a/test/org/apache/tomcat/util/buf/TestUtf8.java b/test/org/apache/tomcat/util/buf/TestUtf8.java index 71e8ef30c1..3bc55a64e1 100644 --- a/test/org/apache/tomcat/util/buf/TestUtf8.java +++ b/test/org/apache/tomcat/util/buf/TestUtf8.java @@ -316,15 +316,6 @@ public class TestUtf8 { TEST_CASES = Collections.unmodifiableList(testCases); } - @Test - public void testHarmonyDecoder() { - CharsetDecoder decoder = new Utf8Decoder(); - for (Utf8TestCase testCase : TEST_CASES) { - doTest(decoder, testCase); - } - } - - @Test public void testJvmDecoder() { CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder(); diff --git a/webapps/docs/changelog.xml b/webapps/docs/changelog.xml index 85ea9b3301..01949d0425 100644 --- a/webapps/docs/changelog.xml +++ b/webapps/docs/changelog.xml @@ -137,6 +137,12 @@ of the OS that uses kernel 5.10 or later. Thanks to Christopher Gual for the research into this issue. (markt) </fix> + <scode> + Remove the custom UTF-8 decoder that was introduced to work around various + UTF-8 decoding bugs in Java. These issues were fixed in early Java 8 + releases. Now the minimum Java version is 11, we can be sure that Tomcat + will not be running on a JRE where these issues are present. (markt) + </scode> </changelog> </subsection> <subsection name="Jasper"> --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@tomcat.apache.org For additional commands, e-mail: dev-h...@tomcat.apache.org