The URL in <meta http-equiv="Refresh" content="42; URL=target.html"> can now freely contain spaces and semicolons. There cannot be other parameters between the delay and the URL. If the URL is not quoted, then it spans to the end of the attribute value, excluding any trailing spaces. If the URL is quoted, then it ends at the first closing quotation mark. All this is consistent with Debian Iceweasel 3.5.16. --- src/document/html/Makefile | 4 +- src/document/html/parse-meta-refresh.c | 97 ++++++++++++ src/document/html/parse-meta-refresh.h | 21 +++ src/document/html/parser.c | 170 +++------------------- src/document/html/test/Makefile | 9 + src/document/html/test/parse-meta-refresh-test.c | 174 ++++++++++++++++++++++ src/document/html/test/test-parse-meta-refresh | 3 + 7 files changed, 325 insertions(+), 153 deletions(-) create mode 100644 src/document/html/parse-meta-refresh.c create mode 100644 src/document/html/parse-meta-refresh.h create mode 100644 src/document/html/test/Makefile create mode 100644 src/document/html/test/parse-meta-refresh-test.c create mode 100755 src/document/html/test/test-parse-meta-refresh
diff --git a/src/document/html/Makefile b/src/document/html/Makefile index 5f7510b..91e7e08 100644 --- a/src/document/html/Makefile +++ b/src/document/html/Makefile @@ -1,7 +1,7 @@ top_builddir=../../.. include $(top_builddir)/Makefile.config -SUBDIRS = parser -OBJS = frames.o parser.o renderer.o tables.o +SUBDIRS = parser test +OBJS = frames.o parse-meta-refresh.o parser.o renderer.o tables.o include $(top_srcdir)/Makefile.lib diff --git a/src/document/html/parse-meta-refresh.c b/src/document/html/parse-meta-refresh.c new file mode 100644 index 0000000..b26135a --- /dev/null +++ b/src/document/html/parse-meta-refresh.c @@ -0,0 +1,97 @@ +/* Parse <meta http-equiv="refresh" content="..."> */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <errno.h> +#include <stdlib.h> + +#include "elinks.h" + +#include "document/html/parse-meta-refresh.h" +#include "osdep/ascii.h" +#include "util/string.h" + +#define LWS(c) ((c) == ' ' || (c) == ASCII_TAB) + +int +html_parse_meta_refresh(const unsigned char *content, + unsigned long *delay_out, + unsigned char **url_out) +{ + const unsigned char *end_url = NULL; + const unsigned char *scan = content; + int negative = 0; + const unsigned char *lookahead; + + *url_out = NULL; + *delay_out = 0; + + while (LWS(*scan)) + ++scan; + + if (!*scan) + return -1; + + + /* Is there something that looks vaguely like a number? */ + lookahead = scan; + if (*lookahead == '-') { + negative = 1; + ++lookahead; + } else if (*lookahead == '+') { + ++lookahead; + } + if (isdigit(*lookahead) || *lookahead == '.') { + unsigned long delay = strtoul(lookahead, NULL, 10); + + if (negative && delay != 0) + return -1; + *delay_out = delay; + + while (isdigit(*lookahead) || *lookahead == '.') + ++lookahead; + scan = lookahead; + } + + while (LWS(*scan) || *scan == ';') + ++scan; + + /* Skip "URL=" if any. With at least one equals sign, + * and optional spaces. 
*/ + if ((scan[0] == 'U' || scan[0] == 'u') + && (scan[1] == 'R' || scan[1] == 'r') + && (scan[2] == 'L' || scan[2] == 'l')) { + lookahead = scan + 3; + + while (LWS(*lookahead)) + ++lookahead; + if (*lookahead == '=') { + while (LWS(*lookahead) || *lookahead == '=') + ++lookahead; + scan = lookahead; + } + } + + if (*scan == '"' || *scan == '\'') { + unsigned char quote = *scan++; + + end_url = strchr(scan, quote); + if (end_url == NULL) + end_url = strchr(scan, '\0'); + } else { + end_url = strchr(scan, '\0'); + while (scan < end_url && LWS(end_url[-1])) + --end_url; + } + + if (end_url == scan) + return 0; + + *url_out = memacpy(scan, end_url - scan); + if (*url_out) + return 0; + else + return -1; +} diff --git a/src/document/html/parse-meta-refresh.h b/src/document/html/parse-meta-refresh.h new file mode 100644 index 0000000..d81409d --- /dev/null +++ b/src/document/html/parse-meta-refresh.h @@ -0,0 +1,21 @@ +#ifndef EL__DOCUMENT_HTML_PARSE_META_REFRESH_H +#define EL__DOCUMENT_HTML_PARSE_META_REFRESH_H + +/** Parses a \<meta http-equiv="refresh" content="..."> element. + * + * @param[in] content + * The value of the content attribute, with entities already expanded. + * @param[out] delay + * How many seconds to wait before refreshing. + * @param[out] url + * The URI to load when refreshing, or NULL to reload the same document. + * The caller must free the string with mem_free() unless it's NULL. + * + * @return + * 0 if successful, or negative on error. + * On error, *@a url is NULL. 
*/ +int html_parse_meta_refresh(const unsigned char *content, + unsigned long *delay, + unsigned char **url); + +#endif diff --git a/src/document/html/parser.c b/src/document/html/parser.c index d9e911a..2edc344 100644 --- a/src/document/html/parser.c +++ b/src/document/html/parser.c @@ -22,6 +22,7 @@ #include "document/css/css.h" #include "document/css/stylesheet.h" #include "document/html/frames.h" +#include "document/html/parse-meta-refresh.h" #include "document/html/parser/link.h" #include "document/html/parser/stack.h" #include "document/html/parser/parse.h" @@ -273,175 +274,42 @@ html_skip(struct html_context *html_context, unsigned char *a) html_top->type = ELEMENT_DONT_KILL; } -#define LWS(c) ((c) == ' ' || (c) == ASCII_TAB) - -/* Parse meta refresh without URL= in it: - * <meta http-equiv="refresh" content="3,http://elinks.or.cz/"> - * <meta http-equiv="refresh" content="3; http://elinks.or.cz/"> - * <meta http-equiv="refresh" content=" 3 ; http://elinks.or.cz/ "> - */ -static void -parse_old_meta_refresh(unsigned char *str, unsigned char **ret) -{ - unsigned char *p = str; - int len; - - assert(str && ret); - if_assert_failed return; - - *ret = NULL; - while (*p && LWS(*p)) p++; - if (!*p) return; - while (*p && *p >= '0' && *p <= '9') p++; - if (!*p) return; - while (*p && LWS(*p)) p++; - if (!*p) return; - if (*p == ';' || *p == ',') p++; else return; - while (*p && LWS(*p)) p++; - if (!*p) return; - - len = strlen(p); - while (len && LWS(p[len])) len--; - if (len) *ret = memacpy(p, len); -} - -/* Search for the url part in the content attribute and returns - * it if found. - * It searches the first occurence of 'url' marker somewhere ignoring - * anything before it. 
- * It should cope with most situations including: - * content="0; URL='http://www.site.com/path/xxx.htm'" - * content="0 url=http://www.site.com/path/xxx.htm" - * content="anything ; some url === ''''http://www.site.com/path/xxx.htm'''' - * - * The return value is one of: - * - * - HEADER_PARAM_FOUND: the parameter was found, copied, and stored in *@ret. - * - HEADER_PARAM_NOT_FOUND: the parameter is not there. *@ret is now NULL. - * - HEADER_PARAM_OUT_OF_MEMORY: error. *@ret is now NULL. - * - * If @ret is NULL, then this function doesn't actually access *@ret, - * and cannot fail with HEADER_PARAM_OUT_OF_MEMORY. Some callers may - * rely on this. */ -static enum parse_header_param -search_for_url_param(unsigned char *str, unsigned char **ret) -{ - unsigned char *p; - int plen = 0; - - if (ret) *ret = NULL; /* default in case of early return */ - - assert(str); - if_assert_failed return HEADER_PARAM_NOT_FOUND; - - /* Returns now if string @str is empty. */ - if (!*str) return HEADER_PARAM_NOT_FOUND; - - p = c_strcasestr(str, "url"); - if (!p) return HEADER_PARAM_NOT_FOUND; - p += 3; - - while (*p && (*p <= ' ' || *p == '=')) p++; - if (!*p) { - if (ret) { - *ret = stracpy(""); - if (!*ret) - return HEADER_PARAM_OUT_OF_MEMORY; - } - return HEADER_PARAM_FOUND; - } - - while ((p[plen] > ' ' || LWS(p[plen])) && p[plen] != ';') plen++; - - /* Trim ending spaces */ - while (plen > 0 && LWS(p[plen - 1])) plen--; - - /* XXX: Drop enclosing single quotes if there's some. - * - * Some websites like newsnow.co.uk are using single quotes around url - * in URL field in meta tag content attribute like this: - * <meta http-equiv="Refresh" content="0; URL='http://www.site.com/path/xxx.htm'"> - * - * This is an attempt to handle that, but it may break something else. - * We drop all pair of enclosing quotes found (eg. '''url''' => url). - * Please report any issue related to this. 
--Zas */ - while (plen > 1 && *p == '\'' && p[plen - 1] == '\'') { - p++; - plen -= 2; - } - - if (ret) { - *ret = memacpy(p, plen); - if (!*ret) - return HEADER_PARAM_OUT_OF_MEMORY; - } - return HEADER_PARAM_FOUND; -} - -#undef LWS - static void check_head_for_refresh(struct html_context *html_context, unsigned char *head) { - unsigned char *refresh, *url; + unsigned char *refresh; + unsigned char *url = NULL; + unsigned char *joined_url = NULL; + unsigned long seconds; refresh = parse_header(head, "Refresh", NULL); if (!refresh) return; - search_for_url_param(refresh, &url); - if (!url) { - /* Let's try a more tolerant parsing. */ - parse_old_meta_refresh(refresh, &url); + if (html_parse_meta_refresh(refresh, &seconds, &url) == 0) { if (!url) { /* If the URL parameter is missing assume that the * document being processed should be refreshed. */ - url = get_uri_string(html_context->base_href, URI_ORIGINAL); + url = get_uri_string(html_context->base_href, + URI_ORIGINAL); } } - if (url) { - /* Extraction of refresh time. */ - unsigned long seconds = 0; - int valid = 1; - - /* We try to extract the refresh time, and to handle weird things - * in an elegant way. Among things we can have negative values, - * too big ones, just ';' (we assume 0 seconds in that case) and - * more. */ - if (*refresh != ';') { - if (isdigit(*refresh)) { - unsigned long max_seconds = HTTP_REFRESH_MAX_DELAY; - - errno = 0; - seconds = strtoul(refresh, NULL, 10); - if (errno == ERANGE || seconds > max_seconds) { - /* Too big refresh value, limit it. */ - seconds = max_seconds; - } else if (errno) { - /* Bad syntax */ - valid = 0; - } - } else { - /* May be a negative number, or some bad syntax. 
*/ - valid = 0; - } - } - - if (valid) { - unsigned char *joined_url = join_urls(html_context->base_href, url); - - html_focusable(html_context, NULL); + if (url) + joined_url = join_urls(html_context->base_href, url); - put_link_line("Refresh: ", url, joined_url, - html_context->options->framename, html_context); - html_context->special_f(html_context, SP_REFRESH, seconds, joined_url); + if (joined_url) { + if (seconds > HTTP_REFRESH_MAX_DELAY) + seconds = HTTP_REFRESH_MAX_DELAY; - mem_free(joined_url); - } + html_focusable(html_context, NULL); - mem_free(url); + put_link_line("Refresh: ", url, joined_url, + html_context->options->framename, html_context); + html_context->special_f(html_context, SP_REFRESH, seconds, joined_url); } + mem_free_if(joined_url); + mem_free_if(url); mem_free(refresh); } diff --git a/src/document/html/test/Makefile b/src/document/html/test/Makefile new file mode 100644 index 0000000..c5363bf --- /dev/null +++ b/src/document/html/test/Makefile @@ -0,0 +1,9 @@ +top_builddir=../../../.. 
+include $(top_builddir)/Makefile.config + +SUBDIRS = +TEST_PROGS = parse-meta-refresh-test +TESTDEPS += \ + $(top_builddir)/src/document/html/parse-meta-refresh.o + +include $(top_srcdir)/Makefile.lib diff --git a/src/document/html/test/parse-meta-refresh-test.c b/src/document/html/test/parse-meta-refresh-test.c new file mode 100644 index 0000000..8c02bff --- /dev/null +++ b/src/document/html/test/parse-meta-refresh-test.c @@ -0,0 +1,174 @@ +/* Test parsing of <meta http-equiv="refresh" content="..."> */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "elinks.h" + +#include "document/html/parse-meta-refresh.h" +#include "util/memory.h" + +struct meta_refresh_test_case +{ + const unsigned char *content; + int error; + unsigned long delay; + const unsigned char *url; +}; + +static const struct meta_refresh_test_case meta_refresh_test_cases[] = { + /* delay only */ + { "42", + 0, 42, NULL }, + { "0", + 0, 0, NULL }, + { " 5 ", + 0, 5, NULL }, + { "9999999999999999999999999", + 0, ULONG_MAX, NULL }, + { "69 ; ", + 0, 69, NULL }, + { "105;", + 0, 105, NULL }, + { "", + -1, 0, NULL }, + + /* simple */ + { "42; URL=file:///dir/file.html", + 0, 42, "file:///dir/file.html" }, + { "42; URL='file:///dir/file.html'", + 0, 42, "file:///dir/file.html" }, + { "42; URL=\"file:///dir/file.html\"", + 0, 42, "file:///dir/file.html" }, + + /* without URL= */ + { "9; file:///dir/file.html", + 0, 9, "file:///dir/file.html" }, + { "9; 'file:///dir/file.html'", + 0, 9, "file:///dir/file.html" }, + { "9; \"file:///dir/file.html\"", + 0, 9, "file:///dir/file.html" }, + + /* lower case */ + { "3; Url=\"file:///dir/file.html\"", + 0, 3, "file:///dir/file.html" }, + { "3; url=\"file:///dir/file.html\"", + 0, 3, "file:///dir/file.html" }, + + /* unusual delimiters */ + { "0 URL=\"file:///dir/file.html\"", + 0, 0, "file:///dir/file.html" }, + { "0 ; URL = \"file:///dir/file.html\"", + 0, 0, 
"file:///dir/file.html" }, + + /* semicolons in the URL */ + { "3; URL=file:///dir/file.cgi?a=1;b=2;c=3", + 0, 3, "file:///dir/file.cgi?a=1;b=2;c=3" }, + { "3; URL=\"file:///dir/file.cgi?a=1;b=2;c=3\"", + 0, 3, "file:///dir/file.cgi?a=1;b=2;c=3" }, + + /* spaces in the URL */ + { "3; URL=\"file:///dir/file.cgi?phrase=Hello, world!\"", + 0, 3, "file:///dir/file.cgi?phrase=Hello, world!" }, + { "3; URL=\"file:///dir/file.cgi?phrase=Hello, world! \"", + 0, 3, "file:///dir/file.cgi?phrase=Hello, world! " }, + { "3; URL=file:///dir/file.cgi?phrase=Hello, world!", + 0, 3, "file:///dir/file.cgi?phrase=Hello, world!" }, + + /* "URL" in the URL */ + { "0; URL=file:///dir/xlat.cgi?url=http://example.org/&lang=cu", + 0, 0, "file:///dir/xlat.cgi?url=http://example.org/&lang=cu" }, + { "0; file:///dir/xlat.cgi?url=http://example.org/&lang=cu", + 0, 0, "file:///dir/xlat.cgi?url=http://example.org/&lang=cu" }, + + /* unusual delays */ + { "; URL=\"file:///dir/file.html\"", + 0, 0, "file:///dir/file.html" }, + { "2.99999; file:///dir/file.html", + 0, 2, "file:///dir/file.html" }, + { "2.99999; 'file:///dir/file.html'", + 0, 2, "file:///dir/file.html" }, + { "040; URL='file:///dir/file.html'", + 0, 40, "file:///dir/file.html" }, + { "-1; URL='file:///dir/file.html'", + -1, 0, NULL }, + { "-2; URL='file:///dir/file.html'", + -1, 0, NULL }, + { " 2; URL='file:///dir/file.html'", + 0, 2, "file:///dir/file.html" }, + { "9999999999999999999999999; URL='file:///dir/file.html'", + 0, ULONG_MAX, "file:///dir/file.html" }, + + /* other stuff after the URL */ + { "5; URL=file:///dir/file.html ", + 0, 5, "file:///dir/file.html" }, + { "5; URL=file:///dir/file.html\t", + 0, 5, "file:///dir/file.html" }, + { "5; URL=\"file:///dir/file.html\" ", + 0, 5, "file:///dir/file.html" }, + { "5; URL=\"file:///dir/file.html\"\t\t", + 0, 5, "file:///dir/file.html" }, + { "5; URL=\"file:///dir/file.html\" ; ", + 0, 5, "file:///dir/file.html" }, + { "5; URL=\"file:///dir/file.html\"; transition=\"sweep\"", 
+ 0, 5, "file:///dir/file.html" }, + + /* sentinel */ + { NULL, 0, 0, NULL } +}; + +int +main(void) +{ + const struct meta_refresh_test_case *test; + int count_ok = 0; + int count_fail = 0; + + for (test = meta_refresh_test_cases; test->content; test++) { + static unsigned char dummy[] = "dummy"; + unsigned long delay = 21; + unsigned char *url = dummy; + + int error = html_parse_meta_refresh(test->content, + &delay, &url); + if (error < 0 && test->error < 0 && url == NULL) { + /* Test OK */ + count_ok++; + } else if (error >= 0 && test->error >= 0 + && ((!url && !test->url) + || (url && test->url && !strcmp(url, test->url))) + && delay == test->delay) { + /* Test OK */ + count_ok++; + } else { + fprintf(stderr, "Test failed at input: %s\n" + "\tParsed error: %d\n" + "\tCorrect error: %d\n" + "\tParsed delay: %lu\n" + "\tCorrect delay: %lu\n" + "\tParsed URL: %s\n" + "\tCorrect URL: %s\n", + test->content, + error, + test->error, + delay, + test->delay, + url ? (char *) url : "(null)", + test->url ? (char *) test->url : "(null)"); + count_fail++; + } + + if (url != dummy && url != NULL) + mem_free(url); + } + + printf("Summary of meta refresh tests: %d OK, %d failed.\n", + count_ok, count_fail); + return count_fail ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/src/document/html/test/test-parse-meta-refresh b/src/document/html/test/test-parse-meta-refresh new file mode 100755 index 0000000..17aec66 --- /dev/null +++ b/src/document/html/test/test-parse-meta-refresh @@ -0,0 +1,3 @@ +#! /bin/sh -e + +./parse-meta-refresh-test -- 1.7.2.5
pgpcTUjSFMQIN.pgp
Description: PGP signature
_______________________________________________ elinks-dev mailing list elinks-dev@linuxfromscratch.org http://linuxfromscratch.org/mailman/listinfo/elinks-dev