On Tue, May 28, 2024 at 12:53:46PM +0100, J Carter wrote: > Hello Sergey, > > On Mon, 27 May 2024 14:21:43 +0400 > Sergey Kandaurov <pluk...@nginx.com> wrote: > > > # HG changeset patch > > # User Sergey Kandaurov <pluk...@nginx.com> > > # Date 1716805272 -14400 > > # Mon May 27 14:21:12 2024 +0400 > > # Node ID e82a7318ed48fdbc1273771bc96357e9dc232975 > > # Parent f58b6f6362387eeace46043a6fc0bceb56a6786a > > Rewritten host header validation to follow generic parsing rules. > > > > It now uses a generic model of state-based machine, with more strict > > parsing rules borrowed from ngx_http_validate_host(), > > I think you mean "borrowed from ngx_http_parse_request_line()". >
Sure, tnx. The problem is that both functions make a subset of parsing and validation, and these sets just partially intersect. In particular, ngx_http_parse_request_line() currently detects invalid characters in a port subcomponent of authority, and ngx_http_validate_host() handles a trailing dot. So I think it makes sense to make them unifined, this will also remove the need to validate host in absolute URIs. Below is an updated version with both parsers further polished (stream part is excluded for now). Also, it may have sense to rename ngx_http_validate_host() to something like ngx_http_parse_host(), similar to ngx_http_parse_uri(), out of this series. # HG changeset patch # User Sergey Kandaurov <pluk...@nginx.com> # Date 1717777582 -14400 # Fri Jun 07 20:26:22 2024 +0400 # Node ID 0cba4301e4980871de7aceb46acddf8f2b5a7318 # Parent 02e9411009b987f408214ab4a8b6b6093f843bcd Improved parsing of host in absolute URIs. When the request line is in the absolute-URI form, a host identified by a registered name (reg-name) is now restricted to start with an alphanumeric character (see RFC 1123, RFC 3986). Previously, empty domain labels or host starting with a hyphen were accepted. Additionally, host with a trailing dot is taken into account. diff --git a/src/http/ngx_http_parse.c b/src/http/ngx_http_parse.c --- a/src/http/ngx_http_parse.c +++ b/src/http/ngx_http_parse.c @@ -113,8 +113,10 @@ ngx_http_parse_request_line(ngx_http_req sw_schema_slash_slash, sw_host_start, sw_host, + sw_host_dot, sw_host_end, sw_host_ip_literal, + sw_host_ip_literal_dot, sw_port, sw_after_slash_in_uri, sw_check_uri, @@ -354,27 +356,50 @@ ngx_http_parse_request_line(ngx_http_req break; } - state = sw_host; + if (ch == '.' || ch == '-') { + return NGX_HTTP_PARSE_INVALID_REQUEST; + } /* fall through */ case sw_host: + case sw_host_dot: + + if (ch == '.') { + if (state == sw_host_dot) { + return NGX_HTTP_PARSE_INVALID_REQUEST; + } + + state = sw_host_dot; + break; + } c = (u_char) (ch | 0x20); if (c >= 'a' && c <= 'z') { + state = sw_host; + break; + } + + if ((ch >= '0' && ch <= '9') || ch == '-') { + state = sw_host; break; } - if ((ch >= '0' && ch <= '9') || ch == '.' || ch == '-') { - break; + if (state == sw_host_start) { + return NGX_HTTP_PARSE_INVALID_REQUEST; + } + + if (state == sw_host_dot) { + r->host_end = p - 1; + + } else { + r->host_end = p; } /* fall through */ case sw_host_end: - r->host_end = p; - switch (ch) { case ':': state = sw_port; @@ -404,6 +429,18 @@ ngx_http_parse_request_line(ngx_http_req break; case sw_host_ip_literal: + case sw_host_ip_literal_dot: + + if (ch == '.') { + if (state == sw_host_ip_literal_dot) { + return NGX_HTTP_PARSE_INVALID_REQUEST; + } + + state = sw_host_ip_literal_dot; + break; + } + + state = sw_host_ip_literal; if (ch >= '0' && ch <= '9') { break; @@ -418,10 +455,10 @@ ngx_http_parse_request_line(ngx_http_req case ':': break; case ']': + r->host_end = p + 1; state = sw_host_end; break; case '-': - case '.': case '_': case '~': /* unreserved */ # HG changeset patch # User Sergey Kandaurov <pluk...@nginx.com> # Date 1717777646 -14400 # Fri Jun 07 20:27:26 2024 +0400 # Node ID 3f7ac1d90a6d4eceabaa5ce45dabf53efd99ed67 # Parent 0cba4301e4980871de7aceb46acddf8f2b5a7318 Rewritten host validation to match host parsing in absolute URIs. It is reimplemented based on ngx_http_parse_request_line() state machine. This introduces several changes, in particular: - host name with underscores is rejected; - a port subcomponent is restricted to digits; - for IP literals, a missing closing bracket and trailing dot are detected. diff --git a/src/http/ngx_http_request.c b/src/http/ngx_http_request.c --- a/src/http/ngx_http_request.c +++ b/src/http/ngx_http_request.c @@ -2145,72 +2145,157 @@ ngx_int_t ngx_http_validate_host(ngx_str_t *host, ngx_pool_t *pool, ngx_uint_t alloc) { u_char *h, ch; - size_t i, dot_pos, host_len; + size_t i, host_len; enum { - sw_usual = 0, - sw_literal, - sw_rest + sw_host_start = 0, + sw_host, + sw_host_dot, + sw_host_end, + sw_host_ip_literal, + sw_host_ip_literal_dot, + sw_port, } state; - dot_pos = host->len; host_len = host->len; h = host->data; - state = sw_usual; + state = sw_host_start; for (i = 0; i < host->len; i++) { ch = h[i]; - switch (ch) { - - case '.': - if (dot_pos == i - 1) { + switch (state) { + + case sw_host_start: + + if (ch == '[') { + state = sw_host_ip_literal; + break; + } + + if (ch == '.' || ch == '-') { return NGX_DECLINED; } - dot_pos = i; - break; - - case ':': - if (state == sw_usual) { - host_len = i; - state = sw_rest; - } - break; - - case '[': - if (i == 0) { - state = sw_literal; - } - break; - - case ']': - if (state == sw_literal) { - host_len = i + 1; - state = sw_rest; - } - break; - - default: - - if (ngx_path_separator(ch)) { - return NGX_DECLINED; - } - - if (ch <= 0x20 || ch == 0x7f) { - return NGX_DECLINED; + + /* fall through */ + + case sw_host: + case sw_host_dot: + + if (ch == '.') { + if (state == sw_host_dot) { + return NGX_DECLINED; + } + + state = sw_host_dot; + break; } if (ch >= 'A' && ch <= 'Z') { alloc = 1; + state = sw_host; + break; } + if (ch >= 'a' && ch <= 'z') { + state = sw_host; + break; + } + + if ((ch >= '0' && ch <= '9') || ch == '-') { + state = sw_host; + break; + } + + if (state == sw_host_dot) { + host_len = i - 1; + + } else { + host_len = i; + } + + /* fall through */ + + case sw_host_end: + + if (ch == ':') { + state = sw_port; + break; + } + return NGX_DECLINED; + + case sw_host_ip_literal: + case sw_host_ip_literal_dot: + + if (ch == '.') { + if (state == sw_host_ip_literal_dot) { + return NGX_DECLINED; + } + + state = sw_host_ip_literal_dot; + break; + } + + state = sw_host_ip_literal; + + if (ch >= 'A' && ch <= 'Z') { + alloc = 1; + break; + } + + if (ch >= 'a' && ch <= 'z') { + break; + } + + if (ch >= '0' && ch <= '9') { + break; + } + + switch (ch) { + case ':': + break; + case ']': + host_len = i + 1; + state = sw_host_end; + break; + case '-': + case '_': + case '~': + /* unreserved */ + break; + case '!': + case '$': + case '&': + case '\'': + case '(': + case ')': + case '*': + case '+': + case ',': + case ';': + case '=': + /* sub-delims */ + break; + default: + return NGX_DECLINED; + } break; + + case sw_port: + if (ch >= '0' && ch <= '9') { + break; + } + return NGX_DECLINED; } } - if (dot_pos == host_len - 1) { + if (state == sw_host_ip_literal) { + return NGX_DECLINED; + } + + if (h[host_len - 1] == '.') { host_len--; } # HG changeset patch # User Sergey Kandaurov <pluk...@nginx.com> # Date 1717777737 -14400 # Fri Jun 07 20:28:57 2024 +0400 # Node ID 722bcffe3d3c9ff4314a2813227c47dc5eff660e # Parent 3f7ac1d90a6d4eceabaa5ce45dabf53efd99ed67 Skip host validation in absolute URIs. Now that parsing of host in the absolute-URI form and of the host header were made the same in previous changes, it makes no sense to validate host once again. Only host case normalization to lowercase is applied (RFC 3986, 6.2.2.1) after parsing absolute URI as this is out of scope. No functional changes intended. diff --git a/src/http/ngx_http_parse.c b/src/http/ngx_http_parse.c --- a/src/http/ngx_http_parse.c +++ b/src/http/ngx_http_parse.c @@ -374,8 +374,13 @@ ngx_http_parse_request_line(ngx_http_req break; } - c = (u_char) (ch | 0x20); - if (c >= 'a' && c <= 'z') { + if (ch >= 'A' && ch <= 'Z') { + r->host_normalize = 1; + state = sw_host; + break; + } + + if (ch >= 'a' && ch <= 'z') { state = sw_host; break; } @@ -446,8 +451,12 @@ ngx_http_parse_request_line(ngx_http_req break; } - c = (u_char) (ch | 0x20); - if (c >= 'a' && c <= 'z') { + if (ch >= 'A' && ch <= 'Z') { + r->host_normalize = 1; + break; + } + + if (ch >= 'a' && ch <= 'z') { break; } diff --git a/src/http/ngx_http_request.c b/src/http/ngx_http_request.c --- a/src/http/ngx_http_request.c +++ b/src/http/ngx_http_request.c @@ -1147,18 +1147,15 @@ ngx_http_process_request_line(ngx_event_ host.len = r->host_end - r->host_start; host.data = r->host_start; - rc = ngx_http_validate_host(&host, r->pool, 0); - - if (rc == NGX_DECLINED) { - ngx_log_error(NGX_LOG_INFO, c->log, 0, - "client sent invalid host in request line"); - ngx_http_finalize_request(r, NGX_HTTP_BAD_REQUEST); - break; - } - - if (rc == NGX_ERROR) { - ngx_http_close_request(r, NGX_HTTP_INTERNAL_SERVER_ERROR); - break; + if (r->host_normalize) { + host.data = ngx_pnalloc(r->pool, host.len); + if (host.data == NULL) { + ngx_http_close_request(r, + NGX_HTTP_INTERNAL_SERVER_ERROR); + break; + } + + ngx_strlow(host.data, r->host_start, host.len); } if (ngx_http_set_virtual_server(r, &host) == NGX_ERROR) { diff --git a/src/http/ngx_http_request.h b/src/http/ngx_http_request.h --- a/src/http/ngx_http_request.h +++ b/src/http/ngx_http_request.h @@ -466,6 +466,9 @@ struct ngx_http_request_s { unsigned http_state:4; + /* host with upper case */ + unsigned host_normalize:1; + /* URI with "/." and on Win32 with "//" */ unsigned complex_uri:1; _______________________________________________ nginx-devel mailing list nginx-devel@nginx.org https://mailman.nginx.org/mailman/listinfo/nginx-devel