Add --custom-html-attrs option to support custom HTML tags and attributes

Lyubomyr Shaydariv Thu, 23 Jan 2020 09:26:19 -0800

Hi,

Some HTML documents use non-standard attributes whose values are essentially
URLs that a recursive download may want to follow. The attached patch lets you
specify additional tags and attributes to follow.


Example of use:

./wget -nd -r -P . -A jpg \
    --custom-html-attrs=div/big_img,div/med_img \
    http://localhost/index.html

However:
The patch does not include tests or external documentation updates of any kind. 
It does not validate tag/attribute pairs beyond a simple check for the "/" delimiter.
I'm not a C programmer, so I'm fine if the patch is considered poor and 
consequently rejected.
Thanks.

From 55f05f4f2cd835d6a125ed5ec5dfa08e5ab222ba Mon Sep 17 00:00:00 2001
From: Lyubomyr Shaydariv <->
Date: Wed, 22 Jan 2020 10:41:00 +0200
Subject: [PATCH] Add --custom-html-attrs option to support custom HTML tags
 and attributes that are not a part of the HTML standard.

Example of use

Suppose a simple HTML document, index.html, is available at localhost:

  <html>
  <head>
  <title>wget --custom-html-attrs test</title>
  </head>
  <body>
  <div zoomed_img="big.jpg"><img src="small.jpg"/></div>
  </body>
  </html>

and two arbitrary images named big.jpg and small.jpg respectively.

Neither the div element nor its zoomed_img attribute is built into wget, so
the pair must be named explicitly to download both images from the document above:

  ./wget -nd -r -P . -A jpg \
    --custom-html-attrs=div/zoomed_img http://localhost/index.html
---
 src/html-url.c | 133 +++++++++++++++++++++++++++++++++++++++++++++----
 src/init.c     |   2 +
 src/main.c     |   3 ++
 src/options.h  |   1 +
 4 files changed, 130 insertions(+), 9 deletions(-)

diff --git a/src/html-url.c b/src/html-url.c
index 9ed420fc..b80e4fbf 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -87,6 +87,8 @@ enum {
   TAG_SOURCE
 };
 
+static const int LAST_KNOWN_TAG_ID = TAG_SOURCE;
+
 /* The list of known tags and functions used for handling them.  Most
    tags are simply harvested for URLs. */
 static struct known_tag {
@@ -140,7 +142,7 @@ static struct known_tag {
 
 /* For tags handled by tag_find_urls: attributes that contain URLs to
    download. */
-static struct {
+static struct known_attribute {
   int tagid;
   const char *attr_name;
   int flags;
@@ -187,6 +189,10 @@ static const char *additional_attributes[] = {
   "srcset",                     /* used by tag_handle_img */
 };
 
+static unsigned int custom_tag_and_attribute_count;
+static struct known_tag *custom_tags;
+static struct known_attribute *custom_attributes;
+
 static struct hash_table *interesting_tags;
 static struct hash_table *interesting_attributes;
 
@@ -194,6 +200,85 @@ static struct hash_table *interesting_attributes;
    meta tags  */
 static char *meta_charset;
 
+static void
+init_custom_tags_and_attributes (void)
+{
+  unsigned int i = 0;
+  size_t len = 0;
+  const int *next_free_tagid;
+  char **custom;
+  char *name;
+
+  if (!opt.custom_html_attrs)
+    return;
+
+  /* Count the number of recognized tag/attribute pairs first.  */
+  for (i = 0, custom = opt.custom_html_attrs; *custom; custom++)
+    {
+      const char *delim_start = strchr (*custom, '/');
+      if (!delim_start)
+        ; // TODO handle missing delimiter
+      else
+        i++;
+    }
+
+  if (!i)
+    return;
+
+  custom_tag_and_attribute_count = i;
+  custom_tags = xnew_array (struct known_tag, custom_tag_and_attribute_count);
+  custom_attributes = xnew_array (struct known_attribute, custom_tag_and_attribute_count);
+
+  /* Allocate a temporary buffer of new tag IDs to refer as values
+     from the hash table below.  */
+  int *tmp_free_tagids = xnew_array (int, custom_tag_and_attribute_count);
+  for (i = 0; i < custom_tag_and_attribute_count; i++)
+    tmp_free_tagids[i] = LAST_KNOWN_TAG_ID + i + 1;
+  next_free_tagid = tmp_free_tagids;
+
+  struct hash_table *tmp_tag_to_tagid = make_nocase_string_hash_table (countof (known_tags));
+  for (i = 0; i < countof (known_tags); i++)
+    hash_table_put (tmp_tag_to_tagid, known_tags[i].name, &known_tags[i].tagid);
+
+  for (i = 0, custom = opt.custom_html_attrs; *custom; custom++)
+    {
+      const char * const delim_start = strchr (*custom, '/');
+      const int * tagid;
+      if (!delim_start)
+        continue;
+
+      /* Split the pattern and take and create the left substring as
+         a tag name, and then check if a particular tag ID can be reused
+         or a new one must be created.  */
+      len = delim_start - *custom;
+      name = xmemdup (*custom, len + 1);
+      name[len] = '\0';
+      tagid = hash_table_get (tmp_tag_to_tagid, name);
+      if (!tagid)
+        {
+          tagid = next_free_tagid;
+          hash_table_put (tmp_tag_to_tagid, name, tagid);
+          next_free_tagid++;
+        }
+      custom_tags[i].tagid = *tagid;
+      custom_tags[i].name = name;
+      custom_tags[i].handler = tag_find_urls;
+
+      /* Now take the right substring as a tag attribute name.  */
+      len = strlen (delim_start + 1);
+      name = xmemdup (delim_start + 1, len + 1);
+      name[len] = '\0';
+      custom_attributes[i].tagid = *tagid;
+      custom_attributes[i].attr_name = name;
+      custom_attributes[i].flags = ATTR_HTML; /* Covers ATTR_INLINE anyway. */
+
+      i++;
+    }
+
+    xfree (tmp_free_tagids);
+    hash_table_destroy (tmp_tag_to_tagid);
+}
+
 static void
 init_interesting (void)
 {
@@ -214,6 +299,9 @@ init_interesting (void)
   for (i = 0; i < countof (known_tags); i++)
     hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
 
+  for (i = 0; i < custom_tag_and_attribute_count; i++)
+    hash_table_put (interesting_tags, custom_tags[i].name, custom_tags + i);
+
   /* Then remove the tags ignored through --ignore-tags.  */
   if (opt.ignore_tags)
     {
@@ -247,6 +335,8 @@ init_interesting (void)
   for (i = 0; i < countof (tag_url_attributes); i++)
     hash_table_put (interesting_attributes,
                     tag_url_attributes[i].attr_name, "1");
+  for (i = 0; i < custom_tag_and_attribute_count; i++)
+    hash_table_put (interesting_attributes, custom_attributes[i].attr_name, "1");
 }
 
 /* Find the value of attribute named NAME in the taginfo TAG.  If the
@@ -421,15 +511,20 @@ check_style_attr (struct taginfo *tag, struct map_context *ctx)
 static void
 tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 {
+  int is_known = tagid <= LAST_KNOWN_TAG_ID;
   size_t i;
   int attrind;
   int first = -1;
+  const size_t size = is_known ? countof (tag_url_attributes)
+                               : custom_tag_and_attribute_count;
+  const struct known_attribute *attributes = is_known ? tag_url_attributes
+                                                      : custom_attributes;
 
-  for (i = 0; i < countof (tag_url_attributes); i++)
-    if (tag_url_attributes[i].tagid == tagid)
+  for (i = 0; i < size; i++)
+    if (attributes[i].tagid == tagid)
       {
-        /* We've found the index of tag_url_attributes where the
-           attributes of our tag begin.  */
+        /* We've found the index of tag_url_attributes or
+           custom_attributes where the attributes of our tag begin.  */
         first = i;
         break;
       }
@@ -449,22 +544,21 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
       /* Find whether TAG/ATTRIND is a combination that contains a
          URL. */
       char *link = tag->attrs[attrind].value;
-      const size_t size = countof (tag_url_attributes);
 
       /* If you're cringing at the inefficiency of the nested loops,
          remember that they both iterate over a very small number of
          items.  The worst-case inner loop is for the IMG tag, which
          has three attributes.  */
-      for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
+      for (i = first; i < size && attributes[i].tagid == tagid; i++)
         {
           if (0 == strcasecmp (tag->attrs[attrind].name,
-                               tag_url_attributes[i].attr_name))
+                               attributes[i].attr_name))
             {
               struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
                                               ATTR_SIZE(tag,attrind), ctx);
               if (up)
                 {
-                  int flags = tag_url_attributes[i].flags;
+                  int flags = attributes[i].flags;
                   if (flags & ATTR_INLINE)
                     up->link_inline_p = 1;
                   if (flags & ATTR_HTML)
@@ -811,6 +905,9 @@ get_urls_html_fm (const char *file, const struct file_memory *fm,
   ctx.document_file = file;
   ctx.nofollow = false;
 
+  if (!custom_tags && !custom_attributes)
+    init_custom_tags_and_attributes ();
+
   if (!interesting_tags)
     init_interesting ();
 
@@ -966,6 +1063,24 @@ get_urls_file (const char *file)
 void
 cleanup_html_url (void)
 {
+  /* Destroy the dynamic arrays for custom tags and attributes
+     allocated in the initialization routine.  */
+  if (custom_tags)
+    {
+      unsigned int i;
+      for (i = 0; i < custom_tag_and_attribute_count; i++) {
+        xfree (custom_tags[i].name);
+      }
+      xfree (custom_tags);
+    }
+  if (custom_attributes)
+    {
+      unsigned int i;
+      for (i = 0; i < custom_tag_and_attribute_count; i++) {
+        xfree (custom_attributes[i].attr_name);
+      }
+      xfree (custom_attributes);
+    }
   /* Destroy the hash tables.  The hash table keys and values are not
      allocated by this code, so we don't need to free them here.  */
   if (interesting_tags)
diff --git a/src/init.c b/src/init.c
index eae5391b..9145d6af 100644
--- a/src/init.c
+++ b/src/init.c
@@ -177,6 +177,7 @@ static const struct {
 #ifdef HAVE_SSL
   { "crlfile",          &opt.crl_file,          cmd_file_once },
 #endif
+  { "customhtmlattrs",  &opt.custom_html_attrs, cmd_vector },
   { "cutdirs",          &opt.cut_dirs,          cmd_number },
   { "debug",            &opt.debug,             cmd_boolean },
   { "defaultpage",      &opt.default_page,      cmd_string },
@@ -1959,6 +1960,7 @@ cleanup (void)
   free_vec (opt.exclude_domains);
   free_vec (opt.follow_tags);
   free_vec (opt.ignore_tags);
+  free_vec (opt.custom_html_attrs);
   xfree (opt.progress_type);
   xfree (opt.warc_filename);
   xfree (opt.warc_tempdir);
diff --git a/src/main.c b/src/main.c
index 4d595ef0..6192cd35 100644
--- a/src/main.c
+++ b/src/main.c
@@ -298,6 +298,7 @@ static struct cmdline_option option_data[] =
     { "content-on-error", 0, OPT_BOOLEAN, "contentonerror", -1 },
     { "cookies", 0, OPT_BOOLEAN, "cookies", -1 },
     { IF_SSL ("crl-file"), 0, OPT_VALUE, "crlfile", -1 },
+    { "custom-html-attrs", 0, OPT_VALUE, "customhtmlattrs", -1 },
     { "cut-dirs", 0, OPT_VALUE, "cutdirs", -1 },
     { "debug", 'd', OPT_BOOLEAN, "debug", -1 },
     { "default-page", 0, OPT_VALUE, "defaultpage", -1 },
@@ -1012,6 +1013,8 @@ Recursive accept/reject:\n"),
        --follow-tags=LIST          comma-separated list of followed HTML tags\n"),
     N_("\
        --ignore-tags=LIST          comma-separated list of ignored HTML tags\n"),
+    N_("\
+       --custom-html-attrs=LIST    comma-separated list of new followed HTML attributes\n"),
     N_("\
   -H,  --span-hosts                go to foreign hosts when recursive\n"),
     N_("\
diff --git a/src/options.h b/src/options.h
index 9a02f3aa..6c7efe59 100644
--- a/src/options.h
+++ b/src/options.h
@@ -112,6 +112,7 @@ struct options
 
   char **follow_tags;           /* List of HTML tags to recursively follow. */
   char **ignore_tags;           /* List of HTML tags to ignore if recursing. */
+  char **custom_html_attrs;     /* List of custom HTML tags and attributes delimited by '/'. */
 
   bool follow_ftp;              /* Are FTP URL-s followed in recursive
                                    retrieving? */
-- 
2.25.0

Add --custom-html-attrs option to support custom HTML tags and attributes

Reply via email to