The URL in <meta http-equiv="Refresh" content="42; URL=target.html"> can now freely contain spaces and semicolons. There cannot be other parameters between the delay and the URL. If the URL is not quoted, then it spans to the end of the attribute value, excluding any trailing spaces. If the URL is quoted, then it ends at the first closing quotation mark. All this is consistent with Debian Iceweasel 3.5.16. --- src/document/html/Makefile | 4 +- src/document/html/parse-meta-refresh.c | 97 ++++++++++++ src/document/html/parse-meta-refresh.h | 21 +++ src/document/html/parser.c | 170 +++------------------- src/document/html/test/Makefile | 9 + src/document/html/test/parse-meta-refresh-test.c | 174 ++++++++++++++++++++++ src/document/html/test/test-parse-meta-refresh | 3 + 7 files changed, 325 insertions(+), 153 deletions(-) create mode 100644 src/document/html/parse-meta-refresh.c create mode 100644 src/document/html/parse-meta-refresh.h create mode 100644 src/document/html/test/Makefile create mode 100644 src/document/html/test/parse-meta-refresh-test.c create mode 100755 src/document/html/test/test-parse-meta-refresh
diff --git a/src/document/html/Makefile b/src/document/html/Makefile index 5f7510b..91e7e08 100644 --- a/src/document/html/Makefile +++ b/src/document/html/Makefile @@ -1,7 +1,7 @@ top_builddir=../../.. include $(top_builddir)/Makefile.config -SUBDIRS = parser -OBJS = frames.o parser.o renderer.o tables.o +SUBDIRS = parser test +OBJS = frames.o parse-meta-refresh.o parser.o renderer.o tables.o include $(top_srcdir)/Makefile.lib diff --git a/src/document/html/parse-meta-refresh.c b/src/document/html/parse-meta-refresh.c new file mode 100644 index 0000000..b26135a --- /dev/null +++ b/src/document/html/parse-meta-refresh.c @@ -0,0 +1,97 @@ +/* Parse <meta http-equiv="refresh" content="..."> */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <errno.h> +#include <stdlib.h> + +#include "elinks.h" + +#include "document/html/parse-meta-refresh.h" +#include "osdep/ascii.h" +#include "util/string.h" + +#define LWS(c) ((c) == ' ' || (c) == ASCII_TAB) + +int +html_parse_meta_refresh(const unsigned char *content, + unsigned long *delay_out, + unsigned char **url_out) +{ + const unsigned char *end_url = NULL; + const unsigned char *scan = content; + int negative = 0; + const unsigned char *lookahead; + + *url_out = NULL; + *delay_out = 0; + + while (LWS(*scan)) + ++scan; + + if (!*scan) + return -1; + + + /* Is there something that looks vaguely like a number? */ + lookahead = scan; + if (*lookahead == '-') { + negative = 1; + ++lookahead; + } else if (*lookahead == '+') { + ++lookahead; + } + if (isdigit(*lookahead) || *lookahead == '.') { + unsigned long delay = strtoul(lookahead, NULL, 10); + + if (negative && delay != 0) + return -1; + *delay_out = delay; + + while (isdigit(*lookahead) || *lookahead == '.') + ++lookahead; + scan = lookahead; + } + + while (LWS(*scan) || *scan == ';') + ++scan; + + /* Skip "URL=" if any. With at least one equals sign, + * and optional spaces. 
*/ + if ((scan[0] == 'U' || scan[0] == 'u') + && (scan[1] == 'R' || scan[1] == 'r') + && (scan[2] == 'L' || scan[2] == 'l')) { + lookahead = scan + 3; + + while (LWS(*lookahead)) + ++lookahead; + if (*lookahead == '=') { + while (LWS(*lookahead) || *lookahead == '=') + ++lookahead; + scan = lookahead; + } + } + + if (*scan == '"' || *scan == '\'') { + unsigned char quote = *scan++; + + end_url = strchr(scan, quote); + if (end_url == NULL) + end_url = strchr(scan, '\0'); + } else { + end_url = strchr(scan, '\0'); + while (scan < end_url && LWS(end_url[-1])) + --end_url; + } + + if (end_url == scan) + return 0; + + *url_out = memacpy(scan, end_url - scan); + if (*url_out) + return 0; + else + return -1; +} diff --git a/src/document/html/parse-meta-refresh.h b/src/document/html/parse-meta-refresh.h new file mode 100644 index 0000000..d81409d --- /dev/null +++ b/src/document/html/parse-meta-refresh.h @@ -0,0 +1,21 @@ +#ifndef EL__DOCUMENT_HTML_PARSE_META_REFRESH_H +#define EL__DOCUMENT_HTML_PARSE_META_REFRESH_H + +/** Parses a \<meta http-equiv="refresh" content="..."> element. + * + * @param[in] content + * The value of the content attribute, with entities already expanded. + * @param[out] delay + * How many seconds to wait before refreshing. + * @param[out] url + * The URI to load when refreshing, or NULL to reload the same document. + * The caller must free the string with mem_free() unless it's NULL. + * + * @return + * 0 if successful, or negative on error. + * On error, *@a url is NULL. 
*/ +int html_parse_meta_refresh(const unsigned char *content, + unsigned long *delay, + unsigned char **url); + +#endif diff --git a/src/document/html/parser.c b/src/document/html/parser.c index d9e911a..2edc344 100644 --- a/src/document/html/parser.c +++ b/src/document/html/parser.c @@ -22,6 +22,7 @@ #include "document/css/css.h" #include "document/css/stylesheet.h" #include "document/html/frames.h" +#include "document/html/parse-meta-refresh.h" #include "document/html/parser/link.h" #include "document/html/parser/stack.h" #include "document/html/parser/parse.h" @@ -273,175 +274,42 @@ html_skip(struct html_context *html_context, unsigned char *a) html_top->type = ELEMENT_DONT_KILL; } -#define LWS(c) ((c) == ' ' || (c) == ASCII_TAB) - -/* Parse meta refresh without URL= in it: - * <meta http-equiv="refresh" content="3,http://elinks.or.cz/"> - * <meta http-equiv="refresh" content="3; http://elinks.or.cz/"> - * <meta http-equiv="refresh" content=" 3 ; http://elinks.or.cz/ "> - */ -static void -parse_old_meta_refresh(unsigned char *str, unsigned char **ret) -{ - unsigned char *p = str; - int len; - - assert(str && ret); - if_assert_failed return; - - *ret = NULL; - while (*p && LWS(*p)) p++; - if (!*p) return; - while (*p && *p >= '0' && *p <= '9') p++; - if (!*p) return; - while (*p && LWS(*p)) p++; - if (!*p) return; - if (*p == ';' || *p == ',') p++; else return; - while (*p && LWS(*p)) p++; - if (!*p) return; - - len = strlen(p); - while (len && LWS(p[len])) len--; - if (len) *ret = memacpy(p, len); -} - -/* Search for the url part in the content attribute and returns - * it if found. - * It searches the first occurence of 'url' marker somewhere ignoring - * anything before it. 
- * It should cope with most situations including: - * content="0; URL='http://www.site.com/path/xxx.htm'" - * content="0 url=http://www.site.com/path/xxx.htm" - * content="anything ; some url === ''''http://www.site.com/path/xxx.htm'''' - * - * The return value is one of: - * - * - HEADER_PARAM_FOUND: the parameter was found, copied, and stored in *@ret. - * - HEADER_PARAM_NOT_FOUND: the parameter is not there. *@ret is now NULL. - * - HEADER_PARAM_OUT_OF_MEMORY: error. *@ret is now NULL. - * - * If @ret is NULL, then this function doesn't actually access *@ret, - * and cannot fail with HEADER_PARAM_OUT_OF_MEMORY. Some callers may - * rely on this. */ -static enum parse_header_param -search_for_url_param(unsigned char *str, unsigned char **ret) -{ - unsigned char *p; - int plen = 0; - - if (ret) *ret = NULL; /* default in case of early return */ - - assert(str); - if_assert_failed return HEADER_PARAM_NOT_FOUND; - - /* Returns now if string @str is empty. */ - if (!*str) return HEADER_PARAM_NOT_FOUND; - - p = c_strcasestr(str, "url"); - if (!p) return HEADER_PARAM_NOT_FOUND; - p += 3; - - while (*p && (*p <= ' ' || *p == '=')) p++; - if (!*p) { - if (ret) { - *ret = stracpy(""); - if (!*ret) - return HEADER_PARAM_OUT_OF_MEMORY; - } - return HEADER_PARAM_FOUND; - } - - while ((p[plen] > ' ' || LWS(p[plen])) && p[plen] != ';') plen++; - - /* Trim ending spaces */ - while (plen > 0 && LWS(p[plen - 1])) plen--; - - /* XXX: Drop enclosing single quotes if there's some. - * - * Some websites like newsnow.co.uk are using single quotes around url - * in URL field in meta tag content attribute like this: - * <meta http-equiv="Refresh" content="0; URL='http://www.site.com/path/xxx.htm'"> - * - * This is an attempt to handle that, but it may break something else. - * We drop all pair of enclosing quotes found (eg. '''url''' => url). - * Please report any issue related to this. 
--Zas */ - while (plen > 1 && *p == '\'' && p[plen - 1] == '\'') { - p++; - plen -= 2; - } - - if (ret) { - *ret = memacpy(p, plen); - if (!*ret) - return HEADER_PARAM_OUT_OF_MEMORY; - } - return HEADER_PARAM_FOUND; -} - -#undef LWS - static void check_head_for_refresh(struct html_context *html_context, unsigned char *head) { - unsigned char *refresh, *url; + unsigned char *refresh; + unsigned char *url = NULL; + unsigned char *joined_url = NULL; + unsigned long seconds; refresh = parse_header(head, "Refresh", NULL); if (!refresh) return; - search_for_url_param(refresh, &url); - if (!url) { - /* Let's try a more tolerant parsing. */ - parse_old_meta_refresh(refresh, &url); + if (html_parse_meta_refresh(refresh, &seconds, &url) == 0) { if (!url) { /* If the URL parameter is missing assume that the * document being processed should be refreshed. */ - url = get_uri_string(html_context->base_href, URI_ORIGINAL); + url = get_uri_string(html_context->base_href, + URI_ORIGINAL); } } - if (url) { - /* Extraction of refresh time. */ - unsigned long seconds = 0; - int valid = 1; - - /* We try to extract the refresh time, and to handle weird things - * in an elegant way. Among things we can have negative values, - * too big ones, just ';' (we assume 0 seconds in that case) and - * more. */ - if (*refresh != ';') { - if (isdigit(*refresh)) { - unsigned long max_seconds = HTTP_REFRESH_MAX_DELAY; - - errno = 0; - seconds = strtoul(refresh, NULL, 10); - if (errno == ERANGE || seconds > max_seconds) { - /* Too big refresh value, limit it. */ - seconds = max_seconds; - } else if (errno) { - /* Bad syntax */ - valid = 0; - } - } else { - /* May be a negative number, or some bad syntax. 
*/ - valid = 0; - } - } - - if (valid) { - unsigned char *joined_url = join_urls(html_context->base_href, url); - - html_focusable(html_context, NULL); + if (url) + joined_url = join_urls(html_context->base_href, url); - put_link_line("Refresh: ", url, joined_url, - html_context->options->framename, html_context); - html_context->special_f(html_context, SP_REFRESH, seconds, joined_url); + if (joined_url) { + if (seconds > HTTP_REFRESH_MAX_DELAY) + seconds = HTTP_REFRESH_MAX_DELAY; - mem_free(joined_url); - } + html_focusable(html_context, NULL); - mem_free(url); + put_link_line("Refresh: ", url, joined_url, + html_context->options->framename, html_context); + html_context->special_f(html_context, SP_REFRESH, seconds, joined_url); } + mem_free_if(joined_url); + mem_free_if(url); mem_free(refresh); } diff --git a/src/document/html/test/Makefile b/src/document/html/test/Makefile new file mode 100644 index 0000000..c5363bf --- /dev/null +++ b/src/document/html/test/Makefile @@ -0,0 +1,9 @@ +top_builddir=../../../.. 
+include $(top_builddir)/Makefile.config + +SUBDIRS = +TEST_PROGS = parse-meta-refresh-test +TESTDEPS += \ + $(top_builddir)/src/document/html/parse-meta-refresh.o + +include $(top_srcdir)/Makefile.lib diff --git a/src/document/html/test/parse-meta-refresh-test.c b/src/document/html/test/parse-meta-refresh-test.c new file mode 100644 index 0000000..8c02bff --- /dev/null +++ b/src/document/html/test/parse-meta-refresh-test.c @@ -0,0 +1,174 @@ +/* Test parsing of <meta http-equiv="refresh" content="..."> */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "elinks.h" + +#include "document/html/parse-meta-refresh.h" +#include "util/memory.h" + +struct meta_refresh_test_case +{ + const unsigned char *content; + int error; + unsigned long delay; + const unsigned char *url; +}; + +static const struct meta_refresh_test_case meta_refresh_test_cases[] = { + /* delay only */ + { "42", + 0, 42, NULL }, + { "0", + 0, 0, NULL }, + { " 5 ", + 0, 5, NULL }, + { "9999999999999999999999999", + 0, ULONG_MAX, NULL }, + { "69 ; ", + 0, 69, NULL }, + { "105;", + 0, 105, NULL }, + { "", + -1, 0, NULL }, + + /* simple */ + { "42; URL=file:///dir/file.html", + 0, 42, "file:///dir/file.html" }, + { "42; URL='file:///dir/file.html'", + 0, 42, "file:///dir/file.html" }, + { "42; URL=\"file:///dir/file.html\"", + 0, 42, "file:///dir/file.html" }, + + /* without URL= */ + { "9; file:///dir/file.html", + 0, 9, "file:///dir/file.html" }, + { "9; 'file:///dir/file.html'", + 0, 9, "file:///dir/file.html" }, + { "9; \"file:///dir/file.html\"", + 0, 9, "file:///dir/file.html" }, + + /* lower case */ + { "3; Url=\"file:///dir/file.html\"", + 0, 3, "file:///dir/file.html" }, + { "3; url=\"file:///dir/file.html\"", + 0, 3, "file:///dir/file.html" }, + + /* unusual delimiters */ + { "0 URL=\"file:///dir/file.html\"", + 0, 0, "file:///dir/file.html" }, + { "0 ; URL = \"file:///dir/file.html\"", + 0, 0, 
"file:///dir/file.html" }, + + /* semicolons in the URL */ + { "3; URL=file:///dir/file.cgi?a=1;b=2;c=3", + 0, 3, "file:///dir/file.cgi?a=1;b=2;c=3" }, + { "3; URL=\"file:///dir/file.cgi?a=1;b=2;c=3\"", + 0, 3, "file:///dir/file.cgi?a=1;b=2;c=3" }, + + /* spaces in the URL */ + { "3; URL=\"file:///dir/file.cgi?phrase=Hello, world!\"", + 0, 3, "file:///dir/file.cgi?phrase=Hello, world!" }, + { "3; URL=\"file:///dir/file.cgi?phrase=Hello, world! \"", + 0, 3, "file:///dir/file.cgi?phrase=Hello, world! " }, + { "3; URL=file:///dir/file.cgi?phrase=Hello, world!", + 0, 3, "file:///dir/file.cgi?phrase=Hello, world!" }, + + /* "URL" in the URL */ + { "0; URL=file:///dir/xlat.cgi?url=http://example.org/&lang=cu", + 0, 0, "file:///dir/xlat.cgi?url=http://example.org/&lang=cu" }, + { "0; file:///dir/xlat.cgi?url=http://example.org/&lang=cu", + 0, 0, "file:///dir/xlat.cgi?url=http://example.org/&lang=cu" }, + + /* unusual delays */ + { "; URL=\"file:///dir/file.html\"", + 0, 0, "file:///dir/file.html" }, + { "2.99999; file:///dir/file.html", + 0, 2, "file:///dir/file.html" }, + { "2.99999; 'file:///dir/file.html'", + 0, 2, "file:///dir/file.html" }, + { "040; URL='file:///dir/file.html'", + 0, 40, "file:///dir/file.html" }, + { "-1; URL='file:///dir/file.html'", + -1, 0, NULL }, + { "-2; URL='file:///dir/file.html'", + -1, 0, NULL }, + { " 2; URL='file:///dir/file.html'", + 0, 2, "file:///dir/file.html" }, + { "9999999999999999999999999; URL='file:///dir/file.html'", + 0, ULONG_MAX, "file:///dir/file.html" }, + + /* other stuff after the URL */ + { "5; URL=file:///dir/file.html ", + 0, 5, "file:///dir/file.html" }, + { "5; URL=file:///dir/file.html\t", + 0, 5, "file:///dir/file.html" }, + { "5; URL=\"file:///dir/file.html\" ", + 0, 5, "file:///dir/file.html" }, + { "5; URL=\"file:///dir/file.html\"\t\t", + 0, 5, "file:///dir/file.html" }, + { "5; URL=\"file:///dir/file.html\" ; ", + 0, 5, "file:///dir/file.html" }, + { "5; URL=\"file:///dir/file.html\"; transition=\"sweep\"", 
+ 0, 5, "file:///dir/file.html" }, + + /* sentinel */ + { NULL, 0, 0, NULL } +}; + +int +main(void) +{ + const struct meta_refresh_test_case *test; + int count_ok = 0; + int count_fail = 0; + + for (test = meta_refresh_test_cases; test->content; test++) { + static unsigned char dummy[] = "dummy"; + unsigned long delay = 21; + unsigned char *url = dummy; + + int error = html_parse_meta_refresh(test->content, + &delay, &url); + if (error < 0 && test->error < 0 && url == NULL) { + /* Test OK */ + count_ok++; + } else if (error >= 0 && test->error >= 0 + && ((!url && !test->url) + || (url && test->url && !strcmp(url, test->url))) + && delay == test->delay) { + /* Test OK */ + count_ok++; + } else { + fprintf(stderr, "Test failed at input: %s\n" + "\tParsed error: %d\n" + "\tCorrect error: %d\n" + "\tParsed delay: %lu\n" + "\tCorrect delay: %lu\n" + "\tParsed URL: %s\n" + "\tCorrect URL: %s\n", + test->content, + error, + test->error, + delay, + test->delay, + url ? (char *) url : "(null)", + test->url ? (char *) test->url : "(null)"); + count_fail++; + } + + if (url != dummy && url != NULL) + mem_free(url); + } + + printf("Summary of meta refresh tests: %d OK, %d failed.\n", + count_ok, count_fail); + return count_fail ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/src/document/html/test/test-parse-meta-refresh b/src/document/html/test/test-parse-meta-refresh new file mode 100755 index 0000000..17aec66 --- /dev/null +++ b/src/document/html/test/test-parse-meta-refresh @@ -0,0 +1,3 @@ +#! /bin/sh -e + +./parse-meta-refresh-test -- 1.7.2.5
pgpcTUjSFMQIN.pgp
Description: PGP signature
_______________________________________________ elinks-dev mailing list elinks-dev@linuxfromscratch.org http://linuxfromscratch.org/mailman/listinfo/elinks-dev