(tomcat) 02/05: Add a common header parser for headers and trailers

markt Fri, 26 Apr 2024 10:11:39 -0700

This is an automated email from the ASF dual-hosted git repository.

markt pushed a commit to branch 9.0.x
in repository https://gitbox.apache.org/repos/asf/tomcat.git


commit e87a432f88663064f066509b53977da2fa2a9bd9
Author: Mark Thomas <ma...@apache.org>
AuthorDate: Fri Apr 26 15:47:23 2024 +0100

    Add a common header parser for headers and trailers
---
 .../tomcat/util/http/parser/HttpHeaderParser.java  | 409 +++++++++++++++++++++
 .../util/http/parser/LocalStrings.properties       |   3 +
 .../util/http/parser/LocalStrings_fr.properties    |   2 +
 .../util/http/parser/LocalStrings_ja.properties    |   2 +
 4 files changed, 416 insertions(+)

diff --git a/java/org/apache/tomcat/util/http/parser/HttpHeaderParser.java 
b/java/org/apache/tomcat/util/http/parser/HttpHeaderParser.java
new file mode 100644
index 0000000000..7ef3b8b5ee
--- /dev/null
+++ b/java/org/apache/tomcat/util/http/parser/HttpHeaderParser.java
@@ -0,0 +1,409 @@
+/*
+ *  Licensed to the Apache Software Foundation (ASF) under one or more
+ *  contributor license agreements.  See the NOTICE file distributed with
+ *  this work for additional information regarding copyright ownership.
+ *  The ASF licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.apache.tomcat.util.http.parser;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.apache.tomcat.util.buf.MessageBytes;
+import org.apache.tomcat.util.http.HeaderUtil;
+import org.apache.tomcat.util.http.MimeHeaders;
+import org.apache.tomcat.util.res.StringManager;
+
+public class HttpHeaderParser {
+
+    private static final StringManager sm = 
StringManager.getManager(HttpHeaderParser.class);
+
+    // Byte values of the characters the parser tests for.
+    private static final byte CR = (byte) '\r';
+    private static final byte LF = (byte) '\n';
+    private static final byte SP = (byte) ' ';
+    private static final byte HT = (byte) '\t';
+    private static final byte COLON = (byte) ':';
+    private static final byte A = (byte) 'A';
+    private static final byte a = (byte) 'a';
+    private static final byte Z = (byte) 'Z';
+    // 'A' - 'a' is negative, so subtracting LC_OFFSET from an upper case
+    // letter yields the matching lower case letter.
+    private static final byte LC_OFFSET = A - a;
+
+    // Provides the buffer that is parsed and refills it on request.
+    private final HeaderDataSource source;
+    // Header names are added here; the returned MessageBytes receives the value.
+    private final MimeHeaders headers;
+    // When true, LF without a preceding CR is accepted as an end of line.
+    private final boolean tolerantEol;
+    // Buffer offsets tracked while a single header is being parsed.
+    private final HeaderParseData headerData = new HeaderParseData();
+
+    // Current parser state. Held in a field so that a parseHeader() call made
+    // after NEED_MORE_DATA was returned continues from the same state.
+    private HeaderParsePosition headerParsePos = 
HeaderParsePosition.HEADER_START;
+    // Previous and current byte read; used to recognise CRLF sequences.
+    private byte prevChr = 0;
+    private byte chr = 0;
+
+
+    /**
+     * Creates a parser that reads header bytes from {@code source} and adds the parsed headers to {@code headers}.
+     *
+     * @param source      Provides the header byte buffer and is asked to refill it when the parser has consumed all
+     *                        available bytes
+     * @param headers     The collection the parsed header names and values are added to
+     * @param tolerantEol {@code true} to accept LF without a preceding CR as a line terminator, {@code false} to
+     *                        reject such a line with an {@link IllegalArgumentException}
+     */
+    public HttpHeaderParser(HeaderDataSource source, MimeHeaders headers, 
boolean tolerantEol) {
+        this.source = source;
+        this.headers = headers;
+        this.tolerantEol = tolerantEol;
+    }
+
+
+    /**
+     * Resets the parse state to its initial values: the current and previous byte are cleared, the state returns to
+     * {@link HeaderParsePosition#HEADER_START} and the per-header position data is recycled. The source and the
+     * target headers are not touched.
+     */
+    public void recycle() {
+        chr = 0;
+        prevChr = 0;
+        headerParsePos = HeaderParsePosition.HEADER_START;
+        headerData.recycle();
+    }
+
+
+    /**
+     * Parse an HTTP header.
+     * <p>
+     * Each call parses at most one header (including any continuation lines that start with SP or HT) from the
+     * buffer provided by the source and adds it to the headers. The parse state is held in fields, so when
+     * {@link HeaderParseStatus#NEED_MORE_DATA} is returned a later call continues from the state that was reached.
+     * <p>
+     * The buffer is modified in place: upper case letters in the header name are converted to lower case and the
+     * bytes of the value are copied towards the start of the value so that leading and trailing whitespace on each
+     * line is dropped. The header name and value reference the array returned by {@code array()} on the buffer.
+     * NOTE(review): buffer positions are passed as array offsets, which presumes the buffer has an array offset of
+     * zero - confirm for all {@link HeaderDataSource} implementations.
+     *
+     * @return One of {@link HeaderParseStatus#NEED_MORE_DATA}, {@link 
HeaderParseStatus#HAVE_MORE_HEADERS} or
+     *             {@link HeaderParseStatus#DONE}.
+     *
+     * @throws IOException              If an I/O error occurs while the source is obtaining more header data
+     * @throws IllegalArgumentException If the header line is not valid (empty or non-token header name, control
+     *                                      character or stray CR in the value) or, when tolerantEol is
+     *                                      {@code false}, a line ends with LF that is not preceded by CR
+     */
+    public HeaderParseStatus parseHeader() throws IOException {
+
+        while (headerParsePos == HeaderParsePosition.HEADER_START) {
+
+            // Read new bytes if needed
+            if (source.getHeaderByteBuffer().position() >= 
source.getHeaderByteBuffer().limit()) {
+                if (!source.fillHeaderBuffer()) {
+                    return HeaderParseStatus.NEED_MORE_DATA;
+                }
+            }
+
+            prevChr = chr;
+            chr = source.getHeaderByteBuffer().get();
+
+            if (chr == CR && prevChr != CR) {
+                // Possible start of CRLF - process the next byte.
+            } else if (chr == LF) {
+                if (!tolerantEol && prevChr != CR) {
+                    throw new 
IllegalArgumentException(sm.getString("httpHeaderParser.invalidCrlfNoCR"));
+                }
+                return HeaderParseStatus.DONE;
+            } else {
+                if (prevChr == CR) {
+                    // Must have read two bytes (first was CR, second was not 
LF)
+                    
source.getHeaderByteBuffer().position(source.getHeaderByteBuffer().position() - 
2);
+                } else {
+                    // Must have only read one byte
+                    
source.getHeaderByteBuffer().position(source.getHeaderByteBuffer().position() - 
1);
+                }
+                break;
+            }
+        }
+
+        if (headerParsePos == HeaderParsePosition.HEADER_START) {
+            // Mark the current buffer position
+            headerData.start = source.getHeaderByteBuffer().position();
+            headerData.lineStart = headerData.start;
+            headerParsePos = HeaderParsePosition.HEADER_NAME;
+        }
+
+        //
+        // Reading the header name
+        // Header name is always US-ASCII
+        //
+
+        while (headerParsePos == HeaderParsePosition.HEADER_NAME) {
+
+            // Read new bytes if needed
+            if (source.getHeaderByteBuffer().position() >= 
source.getHeaderByteBuffer().limit()) {
+                if (!source.fillHeaderBuffer()) {
+                    return HeaderParseStatus.NEED_MORE_DATA;
+                }
+            }
+
+            int pos = source.getHeaderByteBuffer().position();
+            chr = source.getHeaderByteBuffer().get();
+            if (chr == COLON) {
+                if (headerData.start == pos) {
+                    // Zero length header name - not valid.
+                    // skipLine() will handle the error
+                    return skipLine();
+                }
+                headerParsePos = HeaderParsePosition.HEADER_VALUE_START;
+                headerData.headerValue = 
headers.addValue(source.getHeaderByteBuffer().array(), headerData.start,
+                        pos - headerData.start);
+                pos = source.getHeaderByteBuffer().position();
+                // Mark the current buffer position
+                headerData.start = pos;
+                headerData.realPos = pos;
+                headerData.lastSignificantChar = pos;
+                break;
+            } else if (!HttpParser.isToken(chr)) {
+                // Non-token characters are illegal in header names
+                // Parsing continues so the error can be reported in context
+                headerData.lastSignificantChar = pos;
+                
source.getHeaderByteBuffer().position(source.getHeaderByteBuffer().position() - 
1);
+                // skipLine() will handle the error
+                return skipLine();
+            }
+
+            // chr is next byte of header name. Convert to lowercase.
+            if (chr >= A && chr <= Z) {
+                source.getHeaderByteBuffer().put(pos, (byte) (chr - 
LC_OFFSET));
+            }
+        }
+
+        // Resume skipping an invalid line that an earlier call could not finish. skipLine() throws at end of line.
+        if (headerParsePos == HeaderParsePosition.HEADER_SKIPLINE) {
+            return skipLine();
+        }
+
+        //
+        // Reading the header value (which can be spanned over multiple lines)
+        //
+
+        while (headerParsePos == HeaderParsePosition.HEADER_VALUE_START ||
+                headerParsePos == HeaderParsePosition.HEADER_VALUE ||
+                headerParsePos == HeaderParsePosition.HEADER_MULTI_LINE) {
+
+            if (headerParsePos == HeaderParsePosition.HEADER_VALUE_START) {
+                // Skipping spaces
+                while (true) {
+                    // Read new bytes if needed
+                    if (source.getHeaderByteBuffer().position() >= 
source.getHeaderByteBuffer().limit()) {
+                        if (!source.fillHeaderBuffer()) {
+                            return HeaderParseStatus.NEED_MORE_DATA;
+                        }
+                    }
+
+                    chr = source.getHeaderByteBuffer().get();
+                    if (chr != SP && chr != HT) {
+                        headerParsePos = HeaderParsePosition.HEADER_VALUE;
+                        
source.getHeaderByteBuffer().position(source.getHeaderByteBuffer().position() - 
1);
+                        // Avoids prevChr = chr at start of header value
+                        // parsing which causes problems when chr is CR
+                        // (in the case of an empty header value)
+                        chr = 0;
+                        break;
+                    }
+                }
+            }
+            if (headerParsePos == HeaderParsePosition.HEADER_VALUE) {
+
+                // Reading bytes until the end of the line
+                boolean eol = false;
+                while (!eol) {
+
+                    // Read new bytes if needed
+                    if (source.getHeaderByteBuffer().position() >= 
source.getHeaderByteBuffer().limit()) {
+                        if (!source.fillHeaderBuffer()) {
+                            return HeaderParseStatus.NEED_MORE_DATA;
+                        }
+                    }
+
+                    prevChr = chr;
+                    chr = source.getHeaderByteBuffer().get();
+                    if (chr == CR && prevChr != CR) {
+                        // CR is only permitted at the start of a CRLF 
sequence.
+                        // Possible start of CRLF - process the next byte.
+                    } else if (chr == LF) {
+                        if (!tolerantEol && prevChr != CR) {
+                            throw new 
IllegalArgumentException(sm.getString("httpHeaderParser.invalidCrlfNoCR"));
+                        }
+                        eol = true;
+                    } else if (prevChr == CR) {
+                        // Invalid value - also need to delete header
+                        return skipLine();
+                    } else if (HttpParser.isControl(chr) && chr != HT) {
+                        // Invalid value - also need to delete header
+                        return skipLine();
+                    } else if (chr == SP || chr == HT) {
+                        source.getHeaderByteBuffer().put(headerData.realPos, 
chr);
+                        headerData.realPos++;
+                    } else {
+                        source.getHeaderByteBuffer().put(headerData.realPos, 
chr);
+                        headerData.realPos++;
+                        headerData.lastSignificantChar = headerData.realPos;
+                    }
+                }
+
+                // Ignore whitespaces at the end of the line
+                headerData.realPos = headerData.lastSignificantChar;
+
+                // Checking the first character of the new line. If the 
character
+                // is a LWS, then it's a multiline header
+                headerParsePos = HeaderParsePosition.HEADER_MULTI_LINE;
+            }
+            // Read new bytes if needed
+            if (source.getHeaderByteBuffer().position() >= 
source.getHeaderByteBuffer().limit()) {
+                if (!source.fillHeaderBuffer()) {
+                    return HeaderParseStatus.NEED_MORE_DATA;
+                }
+            }
+
+            byte peek = 
source.getHeaderByteBuffer().get(source.getHeaderByteBuffer().position());
+            if (headerParsePos == HeaderParsePosition.HEADER_MULTI_LINE) {
+                if (peek != SP && peek != HT) {
+                    headerParsePos = HeaderParsePosition.HEADER_START;
+                    break;
+                } else {
+                    // Copying one extra space in the buffer (since there must
+                    // be at least one space inserted between the lines)
+                    source.getHeaderByteBuffer().put(headerData.realPos, peek);
+                    headerData.realPos++;
+                    headerParsePos = HeaderParsePosition.HEADER_VALUE_START;
+                }
+            }
+        }
+        // Set the header value
+        headerData.headerValue.setBytes(source.getHeaderByteBuffer().array(), 
headerData.start,
+                headerData.lastSignificantChar - headerData.start);
+        headerData.recycle();
+        return HeaderParseStatus.HAVE_MORE_HEADERS;
+    }
+
+
+    /**
+     * Consumes the remainder of an invalid header line and then rejects it.
+     * <p>
+     * The state is switched to {@link HeaderParsePosition#HEADER_SKIPLINE} and bytes are read up to and including
+     * the terminating LF while the position of the last byte that is neither CR nor LF is recorded. Once the end of
+     * the line has been read an {@link IllegalArgumentException} is always thrown. Its message contains the line
+     * from {@code lineStart} to that last byte inclusive, which is why the length passed on is
+     * {@code lastSignificantChar - lineStart + 1}.
+     *
+     * @return Only ever {@link HeaderParseStatus#NEED_MORE_DATA}, when the source could not provide more data before
+     *             the end of the line was seen. Every other path ends in an exception.
+     *
+     * @throws IOException              If an I/O error occurs while the source is obtaining more header data
+     * @throws IllegalArgumentException When the end of the line is reached or, if tolerantEol is {@code false}, when
+     *                                      the line ends with LF that is not preceded by CR
+     */
+    private HeaderParseStatus skipLine() throws IOException {
+        // Parse the rest of the invalid header so we can construct a useful
+        // exception and/or debug message.
+        headerParsePos = HeaderParsePosition.HEADER_SKIPLINE;
+        boolean eol = false;
+
+        // Reading bytes until the end of the line
+        while (!eol) {
+
+            // Read new bytes if needed
+            if (source.getHeaderByteBuffer().position() >= 
source.getHeaderByteBuffer().limit()) {
+                if (!source.fillHeaderBuffer()) {
+                    return HeaderParseStatus.NEED_MORE_DATA;
+                }
+            }
+
+            int pos = source.getHeaderByteBuffer().position();
+            prevChr = chr;
+            chr = source.getHeaderByteBuffer().get();
+            if (chr == CR) {
+                // Skip
+            } else if (chr == LF) {
+                if (!tolerantEol && prevChr != CR) {
+                    throw new 
IllegalArgumentException(sm.getString("httpHeaderParser.invalidCrlfNoCR"));
+                }
+                eol = true;
+            } else {
+                headerData.lastSignificantChar = pos;
+            }
+        }
+
+        throw new 
IllegalArgumentException(sm.getString("httpHeaderParser.invalidHeader",
+                
HeaderUtil.toPrintableString(source.getHeaderByteBuffer().array(), 
headerData.lineStart,
+                        headerData.lastSignificantChar - headerData.lineStart 
+ 1)));
+    }
+
+
+    /**
+     * Result of a single call to {@code parseHeader()}.
+     */
+    public enum HeaderParseStatus {
+        /**
+         * An end of line was read where a header would start, so there are no more headers.
+         */
+        DONE,
+        /**
+         * A header was parsed and its value has been set. Further headers may follow.
+         */
+        HAVE_MORE_HEADERS,
+        /**
+         * The buffer was exhausted and the source did not add any more data.
+         */
+        NEED_MORE_DATA
+    }
+
+
+    /**
+     * The states of the header parser.
+     */
+    public enum HeaderParsePosition {
+        /**
+         * Start of a new header. A CRLF here means that there are no more 
headers. Any other character starts a header
+         * name.
+         */
+        HEADER_START,
+        /**
+         * Reading a header name. All characters of header are HTTP_TOKEN_CHAR. Header name is followed by ':'. No
+         * whitespace is allowed.<br>
+         * Any non-HTTP_TOKEN_CHAR (this includes any whitespace) encountered before ':' switches the parser to
+         * HEADER_SKIPLINE: the rest of the line is read and the header is then rejected with an
+         * IllegalArgumentException. A zero length header name (':' as the first character) is handled the same way.
+         */
+        HEADER_NAME,
+        /**
+         * Skipping whitespace before text of header value starts, either on 
the first line of header value (just after
+         * ':') or on subsequent lines when it is known that subsequent line 
starts with SP or HT.
+         */
+        HEADER_VALUE_START,
+        /**
+         * Reading the header value. We are inside the value. Either on the 
first line or on any subsequent line. We
+         * come into this state from HEADER_VALUE_START after the first 
non-SP/non-HT byte is encountered on the line.
+         */
+        HEADER_VALUE,
+        /**
+         * Before reading a new line of a header. Once the next byte is 
peeked, the state changes without advancing our
+         * position. The state becomes either HEADER_VALUE_START (if that 
first byte is SP or HT), or HEADER_START
+         * (otherwise).
+         */
+        HEADER_MULTI_LINE,
+        /**
+         * Reading all bytes of an invalid header line until the end of the line. Once the end of the line has been
+         * read, an IllegalArgumentException that includes the line is thrown.
+         */
+        HEADER_SKIPLINE
+    }
+
+
+    /**
+     * Buffer offsets and the value holder tracked while a single header is parsed. All offsets are positions in the
+     * header byte buffer.
+     */
+    private static class HeaderParseData {
+        /**
+         * The first character of the header line. Used as the start of the line reported when an invalid header is
+         * rejected.
+         */
+        int lineStart = 0;
+        /**
+         * When parsing header name: first character of the header.<br>
+         * When skipping broken header line: not modified, so it keeps the value it had when the error was detected.
+         * The reported line starts at lineStart instead.<br>
+         * When parsing header value: first character after ':'.
+         */
+        int start = 0;
+        /**
+         * When parsing header name: not used (stays as 0).<br>
+         * When skipping broken header line: not used (stays as 0).<br>
+         * When parsing header value: starts as the first character after ':'. 
Then is increased as far as more bytes of
+         * the header are harvested. Bytes from buf[pos] are copied to 
buf[realPos]. Thus the string from [start] to
+         * [realPos-1] is the prepared value of the header, with whitespaces 
removed as needed.<br>
+         */
+        int realPos = 0;
+        /**
+         * When parsing header name: not used (stays as 0) until the name ends. It is then set to the position after
+         * the ':' or, for an invalid name, to the position of the offending character.<br>
+         * When skipping broken header line: position of the last non-CR/non-LF character (inclusive).<br>
+         * When parsing header value: position after the last not-LWS character (exclusive).<br>
+         */
+        int lastSignificantChar = 0;
+        /**
+         * MB that will store the value of the header. It is null while 
parsing header name and is created after the
+         * name has been parsed.
+         */
+        MessageBytes headerValue = null;
+
+        /**
+         * Resets all offsets to 0 and drops the reference to the header value.
+         */
+        public void recycle() {
+            lineStart = 0;
+            start = 0;
+            realPos = 0;
+            lastSignificantChar = 0;
+            headerValue = null;
+        }
+    }
+
+
+    /**
+     * Supplies the raw header bytes to the parser. The parser reads from the current position of the buffer up to
+     * its limit, rewrites bytes in place and calls {@code array()} on the buffer, so the buffer needs to expose a
+     * backing array.
+     */
+    public interface HeaderDataSource {
+        /**
+         * Read more data into the header buffer. The implementation is 
expected to determine if blocking or not
+         * blocking IO should be used. The parser calls this when the buffer position has reached the limit and
+         * returns NEED_MORE_DATA to its caller if the result is {@code false}.
+         *
+         * @return {@code true} if more data was added to the buffer, 
otherwise {@code false}
+         *
+         * @throws IOException If an I/O error occurred while obtaining more 
header data
+         */
+        boolean fillHeaderBuffer() throws IOException;
+
+        /**
+         * Obtain a reference to the buffer containing the header data. The parser calls this repeatedly rather than
+         * caching the result.
+         *
+         * @return The buffer containing the header data
+         */
+        ByteBuffer getHeaderByteBuffer();
+    }
+}
diff --git a/java/org/apache/tomcat/util/http/parser/LocalStrings.properties 
b/java/org/apache/tomcat/util/http/parser/LocalStrings.properties
index 325e7da57f..07238a9424 100644
--- a/java/org/apache/tomcat/util/http/parser/LocalStrings.properties
+++ b/java/org/apache/tomcat/util/http/parser/LocalStrings.properties
@@ -45,6 +45,9 @@ http.tooManyColons=An IPv6 address may not contain more than 
2 sequential colon
 http.tooManyDoubleColons=An IPv6 address may only contain a single '::' 
sequence.
 http.tooManyHextets=The IPv6 address contains [{0}] hextets but a valid IPv6 
address may not have more than 8.
 
+httpHeaderParser.invalidCrlfNoCR=Invalid end of line sequence (No CR before LF)
+httpHeaderParser.invalidHeader=The HTTP header line [{0}] does not conform to 
RFC 9112. The request has been rejected.
+
 sf.bareitem.invalidCharacter=The invalid character [{0}] was found parsing 
when start of a bare item
 sf.base64.invalidCharacter=The [{0}] character is not valid inside a base64 
sequence
 sf.boolean.invalidCharacter=The [{0}] character is not a valid boolean value
diff --git a/java/org/apache/tomcat/util/http/parser/LocalStrings_fr.properties 
b/java/org/apache/tomcat/util/http/parser/LocalStrings_fr.properties
index a954343891..a1be4a83bd 100644
--- a/java/org/apache/tomcat/util/http/parser/LocalStrings_fr.properties
+++ b/java/org/apache/tomcat/util/http/parser/LocalStrings_fr.properties
@@ -45,6 +45,8 @@ http.tooManyColons=Une adresse IPv6 ne peut pas contenir plus 
de deux caractère
 http.tooManyDoubleColons=Une adresse IPv6 ne peut contenir qu'une seule 
séquence "::"
 http.tooManyHextets=L''adresse IPv6 contient [{0}] groupes de 4 octets mais 
une adresse IPv6 valide ne doit pas en avoir plus de 8
 
+httpHeaderParser.invalidHeader=La ligne d''en-tête HTTP [{0}] ne respecte pas 
la RFC 7230. La requête a été rejetée.
+
 sf.bareitem.invalidCharacter=Le caractère [{0}] invalide a été rencontré en 
début d''un objet
 sf.base64.invalidCharacter=Le caractère [{0}] est invalide dans une séquence 
base64
 sf.boolean.invalidCharacter=Le caractère [{0}] n''est pas une valeur booléene 
valide
diff --git a/java/org/apache/tomcat/util/http/parser/LocalStrings_ja.properties 
b/java/org/apache/tomcat/util/http/parser/LocalStrings_ja.properties
index b9bf412a57..abde689744 100644
--- a/java/org/apache/tomcat/util/http/parser/LocalStrings_ja.properties
+++ b/java/org/apache/tomcat/util/http/parser/LocalStrings_ja.properties
@@ -45,6 +45,8 @@ http.tooManyColons=IPv6 アドレスでは文字 : を 2 つ以上連続する
 http.tooManyDoubleColons=IPv6アドレスは単一の '::'シーケンスのみを含むことができます。
 http.tooManyHextets=IPv6 アドレスは [{0}] ヘクステットで構成されていますが、正常な IPv6 アドレスなら 8 
ヘクステット以上になりません。
 
+httpHeaderParser.invalidHeader=HTTP ヘッダーの [{0}] 行目は RFC 7230 
に準拠していません。リクエストは拒否されました。
+
 sf.bareitem.invalidCharacter=ベアアイテムの開始を解析中に無効な文字 [{0}] が見つかりました
 sf.base64.invalidCharacter=文字 [{0}] は base64 シーケンス内では無効です
 sf.boolean.invalidCharacter=文字 [{0}] は有効なブール値ではありません


---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscr...@tomcat.apache.org
For additional commands, e-mail: dev-h...@tomcat.apache.org

(tomcat) 02/05: Add a common header parser for headers and trailers

Reply via email to