Hi,
Some HTML documents use non-standard attributes whose values are essentially
URLs that could be followed. The attached patch allows specifying additional
tag/attribute pairs to follow.
Example of use:
./wget -nd -r -P . -A jpg \
--custom-html-attrs=div/big_img,div/med_img \
http://localhost/index.html
However:
The patch does not include tests or external documentation updates of any kind.
It does not validate tag/attribute pairs beyond a simple check for the "/"
delimiter.
I'm not a C programmer, so I'm fine if the patch is considered poor and
consequently rejected.
Thanks.
From 55f05f4f2cd835d6a125ed5ec5dfa08e5ab222ba Mon Sep 17 00:00:00 2001
From: Lyubomyr Shaydariv <->
Date: Wed, 22 Jan 2020 10:41:00 +0200
Subject: [PATCH] Add --custom-html-attrs option to support custom HTML tags
and attributes that are not a part of the HTML standard.
Example of use
Suppose a simple HTML document, index.html, is available at localhost:
<html>
<head>
<title>wget --custom-html-attrs test</title>
</head>
<body>
<div zoomed_img="big.jpg"><img src="small.jpg"/></div>
</body>
</html>
and two arbitrary images named big.jpg and small.jpg respectively.
Neither the div element nor its zoomed_img attribute is built into wget,
so the images from the document above can be downloaded as follows:
./wget -nd -r -P . -A jpg \
--custom-html-attrs=div/zoomed_img http://localhost/index.html
---
src/html-url.c | 133 +++++++++++++++++++++++++++++++++++++++++++++----
src/init.c | 2 +
src/main.c | 3 ++
src/options.h | 1 +
4 files changed, 130 insertions(+), 9 deletions(-)
diff --git a/src/html-url.c b/src/html-url.c
index 9ed420fc..b80e4fbf 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -87,6 +87,8 @@ enum {
TAG_SOURCE
};
+static const int LAST_KNOWN_TAG_ID = TAG_SOURCE;
+
/* The list of known tags and functions used for handling them. Most
tags are simply harvested for URLs. */
static struct known_tag {
@@ -140,7 +142,7 @@ static struct known_tag {
/* For tags handled by tag_find_urls: attributes that contain URLs to
download. */
-static struct {
+static struct known_attribute {
int tagid;
const char *attr_name;
int flags;
@@ -187,6 +189,10 @@ static const char *additional_attributes[] = {
"srcset", /* used by tag_handle_img */
};
+static unsigned int custom_tag_and_attribute_count;
+static struct known_tag *custom_tags;
+static struct known_attribute *custom_attributes;
+
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;
@@ -194,6 +200,85 @@ static struct hash_table *interesting_attributes;
meta tags */
static char *meta_charset;
+/* Fill CUSTOM_TAGS and CUSTOM_ATTRIBUTES from the "tag/attribute" pairs
+   given with --custom-html-attrs; pairs without a '/' are skipped.
+   Entries of one tag are kept adjacent in CUSTOM_ATTRIBUTES because
+   tag_find_urls only scans a contiguous run of equal tag IDs.
+   NOTE(review): a built-in tag keeps its built-in ID, and tag_find_urls
+   reads only tag_url_attributes for such IDs -- confirm this is intended.  */
+static void
+init_custom_tags_and_attributes (void)
+{
+  unsigned int i, pos, count = 0;
+  size_t len;
+  int *free_tagids;
+  const int *next_free_tagid, *tagid;
+  struct hash_table *tag_to_tagid;
+  const char *delim;
+  char **custom, *name;
+
+  if (!opt.custom_html_attrs)
+    return;
+
+  /* Count the recognized pairs first to size the arrays.  */
+  for (custom = opt.custom_html_attrs; *custom; custom++)
+    if (strchr (*custom, '/'))
+      count++;
+  if (!count)
+    return;
+
+  custom_tag_and_attribute_count = count;
+  custom_tags = xnew_array (struct known_tag, count);
+  custom_attributes = xnew_array (struct known_attribute, count);
+
+  /* Backing storage for the fresh IDs the hash table will point to.  */
+  free_tagids = xnew_array (int, count);
+  for (i = 0; i < count; i++)
+    free_tagids[i] = LAST_KNOWN_TAG_ID + i + 1;
+  next_free_tagid = free_tagids;
+
+  tag_to_tagid = make_nocase_string_hash_table (countof (known_tags));
+  for (i = 0; i < countof (known_tags); i++)
+    hash_table_put (tag_to_tagid, known_tags[i].name, &known_tags[i].tagid);
+
+  for (i = 0, custom = opt.custom_html_attrs; *custom; custom++)
+    {
+      delim = strchr (*custom, '/');
+      if (!delim)
+        continue;
+
+      /* Left part: tag name; reuse a known ID or take the next free one.  */
+      len = delim - *custom;
+      name = xmemdup (*custom, len + 1);
+      name[len] = '\0';
+      tagid = hash_table_get (tag_to_tagid, name);
+      if (!tagid)
+        {
+          tagid = next_free_tagid++;
+          hash_table_put (tag_to_tagid, name, tagid);
+        }
+      custom_tags[i].tagid = *tagid;
+      custom_tags[i].name = name;
+      custom_tags[i].handler = tag_find_urls;
+
+      /* Right part: attribute name, placed next to its tag's other entries.  */
+      for (pos = i; pos > 0; pos--)
+        if (custom_attributes[pos - 1].tagid == *tagid)
+          break;
+      if (pos == 0)
+        pos = i;
+      memmove (custom_attributes + pos + 1, custom_attributes + pos,
+               (i - pos) * sizeof *custom_attributes);
+      len = strlen (delim + 1);
+      custom_attributes[pos].tagid = *tagid;
+      custom_attributes[pos].attr_name = xmemdup (delim + 1, len + 1);
+      custom_attributes[pos].flags = ATTR_HTML;
+      i++;
+    }
+  xfree (free_tagids);
+  hash_table_destroy (tag_to_tagid);
+}
+
static void
init_interesting (void)
{
@@ -214,6 +299,9 @@ init_interesting (void)
for (i = 0; i < countof (known_tags); i++)
hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
+ for (i = 0; i < custom_tag_and_attribute_count; i++)
+ hash_table_put (interesting_tags, custom_tags[i].name, custom_tags + i);
+
/* Then remove the tags ignored through --ignore-tags. */
if (opt.ignore_tags)
{
@@ -247,6 +335,8 @@ init_interesting (void)
for (i = 0; i < countof (tag_url_attributes); i++)
hash_table_put (interesting_attributes,
tag_url_attributes[i].attr_name, "1");
+ for (i = 0; i < custom_tag_and_attribute_count; i++)
+ hash_table_put (interesting_attributes, custom_attributes[i].attr_name, "1");
}
/* Find the value of attribute named NAME in the taginfo TAG. If the
@@ -421,15 +511,20 @@ check_style_attr (struct taginfo *tag, struct map_context *ctx)
static void
tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
{
+ int is_known = tagid <= LAST_KNOWN_TAG_ID;
size_t i;
int attrind;
int first = -1;
+ const size_t size = is_known ? countof (tag_url_attributes)
+ : custom_tag_and_attribute_count;
+ const struct known_attribute *attributes = is_known ? tag_url_attributes
+ : custom_attributes;
- for (i = 0; i < countof (tag_url_attributes); i++)
- if (tag_url_attributes[i].tagid == tagid)
+ for (i = 0; i < size; i++)
+ if (attributes[i].tagid == tagid)
{
- /* We've found the index of tag_url_attributes where the
- attributes of our tag begin. */
+ /* We've found the index of tag_url_attributes or
+ custom_attributes where the attributes of our tag begin. */
first = i;
break;
}
@@ -449,22 +544,21 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
/* Find whether TAG/ATTRIND is a combination that contains a
URL. */
char *link = tag->attrs[attrind].value;
- const size_t size = countof (tag_url_attributes);
/* If you're cringing at the inefficiency of the nested loops,
remember that they both iterate over a very small number of
items. The worst-case inner loop is for the IMG tag, which
has three attributes. */
- for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
+ for (i = first; i < size && attributes[i].tagid == tagid; i++)
{
if (0 == strcasecmp (tag->attrs[attrind].name,
- tag_url_attributes[i].attr_name))
+ attributes[i].attr_name))
{
struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
ATTR_SIZE(tag,attrind), ctx);
if (up)
{
- int flags = tag_url_attributes[i].flags;
+ int flags = attributes[i].flags;
if (flags & ATTR_INLINE)
up->link_inline_p = 1;
if (flags & ATTR_HTML)
@@ -811,6 +905,9 @@ get_urls_html_fm (const char *file, const struct file_memory *fm,
ctx.document_file = file;
ctx.nofollow = false;
+ if (!custom_tags && !custom_attributes)
+ init_custom_tags_and_attributes ();
+
if (!interesting_tags)
init_interesting ();
@@ -966,6 +1063,24 @@ get_urls_file (const char *file)
void
cleanup_html_url (void)
{
+ /* Destroy the dynamic arrays for custom tags and attributes
+ allocated in the initialization routine. */
+ if (custom_tags)
+ {
+ unsigned int i;
+ for (i = 0; i < custom_tag_and_attribute_count; i++) {
+ xfree (custom_tags[i].name);
+ }
+ xfree (custom_tags);
+ }
+ if (custom_attributes)
+ {
+ unsigned int i;
+ for (i = 0; i < custom_tag_and_attribute_count; i++) {
+ xfree (custom_attributes[i].attr_name);
+ }
+ xfree (custom_attributes);
+ }
/* Destroy the hash tables. The hash table keys and values are not
allocated by this code, so we don't need to free them here. */
if (interesting_tags)
diff --git a/src/init.c b/src/init.c
index eae5391b..9145d6af 100644
--- a/src/init.c
+++ b/src/init.c
@@ -177,6 +177,7 @@ static const struct {
#ifdef HAVE_SSL
{ "crlfile", &opt.crl_file, cmd_file_once },
#endif
+ { "customhtmlattrs", &opt.custom_html_attrs, cmd_vector },
{ "cutdirs", &opt.cut_dirs, cmd_number },
{ "debug", &opt.debug, cmd_boolean },
{ "defaultpage", &opt.default_page, cmd_string },
@@ -1959,6 +1960,7 @@ cleanup (void)
free_vec (opt.exclude_domains);
free_vec (opt.follow_tags);
free_vec (opt.ignore_tags);
+ free_vec (opt.custom_html_attrs);
xfree (opt.progress_type);
xfree (opt.warc_filename);
xfree (opt.warc_tempdir);
diff --git a/src/main.c b/src/main.c
index 4d595ef0..6192cd35 100644
--- a/src/main.c
+++ b/src/main.c
@@ -298,6 +298,7 @@ static struct cmdline_option option_data[] =
{ "content-on-error", 0, OPT_BOOLEAN, "contentonerror", -1 },
{ "cookies", 0, OPT_BOOLEAN, "cookies", -1 },
{ IF_SSL ("crl-file"), 0, OPT_VALUE, "crlfile", -1 },
+ { "custom-html-attrs", 0, OPT_VALUE, "customhtmlattrs", -1 },
{ "cut-dirs", 0, OPT_VALUE, "cutdirs", -1 },
{ "debug", 'd', OPT_BOOLEAN, "debug", -1 },
{ "default-page", 0, OPT_VALUE, "defaultpage", -1 },
@@ -1012,6 +1013,8 @@ Recursive accept/reject:\n"),
--follow-tags=LIST comma-separated list of followed HTML tags\n"),
N_("\
--ignore-tags=LIST comma-separated list of ignored HTML tags\n"),
+ N_("\
+ --custom-html-attrs=LIST comma-separated list of new followed HTML attributes\n"),
N_("\
-H, --span-hosts go to foreign hosts when recursive\n"),
N_("\
diff --git a/src/options.h b/src/options.h
index 9a02f3aa..6c7efe59 100644
--- a/src/options.h
+++ b/src/options.h
@@ -112,6 +112,7 @@ struct options
char **follow_tags; /* List of HTML tags to recursively follow. */
char **ignore_tags; /* List of HTML tags to ignore if recursing. */
+ char **custom_html_attrs; /* List of custom HTML tags and attributes delimited by '/'. */
bool follow_ftp; /* Are FTP URL-s followed in recursive
retrieving? */
--
2.25.0