[Bug-wget] Patch for understanding srcset= on img tags.

Maksim Orlovich Mon, 29 Feb 2016 16:05:57 -0800

Hi... wget currently doesn't understand HTML5's srcset= attribute on
images. The attached adds support for it.
This is under Google copyright, so should be covered by the company's
copyright assignment with the FSF.


If you might be interested in incorporating this in some form, please
let me know if you want any changes (e.g. tests, etc.), ---
not really familiar with how you folks do things.

Hoping this may be of some use to someone else,
Maks

From 591a89a3d7f42b5d0d7cb6936a97dc4770b64e97 Mon Sep 17 00:00:00 2001
From: Maks Orlovich <[email protected]>
Date: Mon, 29 Feb 2016 13:40:45 -0500
Subject: Parse <img srcset> attributes, they have image URLs.

* src/convert.h: Add link_noquote_html_p to permit rewriting URLs deep
                 inside attributes without adding extraneous quoting
* src/convert.c (convert_links): Honor link_noquote_html_p
* src/html_url.c (tag_handle_img): New function. Add srcset parsing.

diff --git a/src/convert.c b/src/convert.c
index df8d58d..509923e 100644
--- a/src/convert.c
+++ b/src/convert.c
@@ -308,7 +308,7 @@ convert_links (const char *file, struct urlpos *links)
             char *quoted_newname = local_quote_string (newname,
                                                        link->link_css_p);
 
-            if (link->link_css_p)
+            if (link->link_css_p || link->link_noquote_html_p)
               p = replace_plain (p, link->size, fp, quoted_newname);
             else if (!link->link_refresh_p)
               p = replace_attr (p, link->size, fp, quoted_newname);
@@ -329,7 +329,7 @@ convert_links (const char *file, struct urlpos *links)
             char *newname = convert_basename (p, link);
             char *quoted_newname = local_quote_string (newname, 
link->link_css_p);
 
-            if (link->link_css_p)
+            if (link->link_css_p || link->link_noquote_html_p)
               p = replace_plain (p, link->size, fp, quoted_newname);
             else if (!link->link_refresh_p)
               p = replace_attr (p, link->size, fp, quoted_newname);
@@ -352,7 +352,7 @@ convert_links (const char *file, struct urlpos *links)
             char *newlink = link->url->url;
             char *quoted_newlink = html_quote_string (newlink);
 
-            if (link->link_css_p)
+            if (link->link_css_p || link->link_noquote_html_p)
               p = replace_plain (p, link->size, fp, newlink);
             else if (!link->link_refresh_p)
               p = replace_attr (p, link->size, fp, quoted_newlink);
diff --git a/src/convert.h b/src/convert.h
index b3cd196..e3ff6f0 100644
--- a/src/convert.h
+++ b/src/convert.h
@@ -69,6 +69,7 @@ struct urlpos {
   unsigned int link_base_p  :1;     /* the url came from <base href=...> */
   unsigned int link_inline_p    :1; /* needed to render the page */
   unsigned int link_css_p   :1;     /* the url came from CSS */
+  unsigned int link_noquote_html_p :1; /* from HTML, but doesn't need " */
   unsigned int link_expect_html :1; /* expected to contain HTML */
   unsigned int link_expect_css  :1; /* expected to contain CSS */
 
diff --git a/src/html-url.c b/src/html-url.c
index 0743587..dff8d57 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -56,6 +56,7 @@ typedef void (*tag_handler_t) (int, struct taginfo *, struct 
map_context *);
 DECLARE_TAG_HANDLER (tag_find_urls);
 DECLARE_TAG_HANDLER (tag_handle_base);
 DECLARE_TAG_HANDLER (tag_handle_form);
+DECLARE_TAG_HANDLER (tag_handle_img);
 DECLARE_TAG_HANDLER (tag_handle_link);
 DECLARE_TAG_HANDLER (tag_handle_meta);
 
@@ -105,7 +106,7 @@ static struct known_tag {
   { TAG_FORM,    "form",        tag_handle_form },
   { TAG_FRAME,   "frame",       tag_find_urls },
   { TAG_IFRAME,  "iframe",      tag_find_urls },
-  { TAG_IMG,     "img",         tag_find_urls },
+  { TAG_IMG,     "img",         tag_handle_img },
   { TAG_INPUT,   "input",       tag_find_urls },
   { TAG_LAYER,   "layer",       tag_find_urls },
   { TAG_LINK,    "link",        tag_handle_link },
@@ -183,7 +184,8 @@ static const char *additional_attributes[] = {
   "name",                       /* used by tag_handle_meta  */
   "content",                    /* used by tag_handle_meta  */
   "action",                     /* used by tag_handle_form  */
-  "style"                       /* used by check_style_attr */
+  "style",                      /* used by check_style_attr */
+  "srcset",                     /* used by tag_handle_img */
 };
 
 static struct hash_table *interesting_tags;
@@ -674,6 +676,88 @@ tag_handle_meta (int tagid _GL_UNUSED, struct taginfo 
*tag, struct map_context *
     }
 }
 
+/* Handle the IMG tag.  This requires special handling for the srcset attr,
+   while the traditional src/lowsrc/href attributes can be handled generically.
+*/
+
+static void
+tag_handle_img (int tagid, struct taginfo *tag, struct map_context *ctx) {
+  int attrind;
+  char *srcset;
+
+  /* Use the generic approach for the attributes without special syntax. */
+  tag_find_urls(tagid, tag, ctx);
+
+  srcset = find_attr (tag, "srcset", &attrind);
+  if (srcset)
+    {
+      /* These are relative to the input text. */
+      int base_ind = ATTR_POS(tag,attrind,ctx);
+      int size = strlen(srcset);
+
+      /* These are relative to srcset. */
+      int offset, url_start, url_end;
+
+      /* Make sure to line up base_ind with srcset[0], not outside quotes. */
+      if (ctx->text[base_ind] == '"' || ctx->text[base_ind] == '\'')
+        ++base_ind;
+
+      offset = 0;
+      while (offset < size)
+        {
+          bool has_descriptor = true;
+
+          /* Skip over initial whitespace and commas. Note there is no \v
+            in HTML5 whitespace. */
+          url_start = offset + strspn (srcset + offset, " \f\n\r\t,");
+
+          if (url_start == size)
+            return;
+
+          /* URL is any non-whitespace chars (including commas) - but with
+             trailing commas removed. */
+          url_end = url_start + strcspn (srcset + url_start, " \f\n\r\t");
+          while ((url_end - 1) > url_start && srcset[url_end - 1] == ',')
+            {
+              has_descriptor = false;
+              --url_end;
+            }
+
+          if (url_end > url_start)
+            {
+              char *url_text = strdupdelim (srcset + url_start,
+                                            srcset + url_end);
+              struct urlpos *up = append_url (url_text, base_ind + url_start,
+                                              url_end - url_start, ctx);
+              up->link_inline_p = 1;
+              up->link_noquote_html_p = 1;
+              xfree (url_text);
+            }
+
+          /* If the URL wasn't terminated by a , there may also be a descriptor
+             which we just skip. */
+          if (has_descriptor)
+            {
+              /* This is comma-terminated, except there may be one level of
+                 parentheses escaping that. */
+              bool in_paren = false;
+              for (offset = url_end; offset < size; ++offset)
+                {
+                  char c = srcset[offset];
+                  if (c == '(')
+                    in_paren = true;
+                  else if (c == '(' && in_paren)
+                    in_paren = false;
+                  else if (c == ',' && !in_paren)
+                    break;
+                }
+            }
+          else
+            offset = url_end;
+        }
+    }
+}
+
 /* Dispatch the tag handler appropriate for the tag we're mapping
    over.  See known_tags[] for definition of tag handlers.  */

[Bug-wget] Patch for understanding srcset= on img tags.

Reply via email to