Re: [Bug-wget] Patch for understanding srcset= on img tags.

Maksim Orlovich Tue, 01 Mar 2016 07:02:27 -0800

> thanks for your patch!  I have some comments.  Please amend this:
>
> diff --git a/src/html-url.c b/src/html-url.c
> index dff8d57..2f205c7 100644
> --- a/src/html-url.c
> +++ b/src/html-url.c
> @@ -692,8 +692,8 @@ tag_handle_img (int tagid, struct taginfo *tag, struct map_context *ctx) {
>    if (srcset)
>      {
>        /* These are relative to the input text. */
> -      int base_ind = ATTR_POS(tag,attrind,ctx);
> -      int size = strlen(srcset);
> +      int base_ind = ATTR_POS (tag, attrind, ctx);
> +      int size = strlen (srcset);


Done.


> should the condition be (c == ')' && in_paren)  ?

Indeed.

Thanks,
Maks

From 49933c84012536388e1f9d0bc4070e377d824309 Mon Sep 17 00:00:00 2001
From: Maks Orlovich <[email protected]>
Date: Tue, 1 Mar 2016 09:43:56 -0500
Subject: Parse <img srcset> attributes, they have image URLs.

* src/convert.h: Add link_noquote_html_p to permit rewriting URLs deep
                 inside attributes without adding extraneous quoting
* src/convert.c (convert_links): Honor link_noquote_html_p
* src/html-url.c (tag_handle_img): New function. Add srcset parsing.

diff --git a/src/convert.c b/src/convert.c
index df8d58d..509923e 100644
--- a/src/convert.c
+++ b/src/convert.c
@@ -308,7 +308,7 @@ convert_links (const char *file, struct urlpos *links)
             char *quoted_newname = local_quote_string (newname,
                                                        link->link_css_p);
 
-            if (link->link_css_p)
+            if (link->link_css_p || link->link_noquote_html_p)
               p = replace_plain (p, link->size, fp, quoted_newname);
             else if (!link->link_refresh_p)
               p = replace_attr (p, link->size, fp, quoted_newname);
@@ -329,7 +329,7 @@ convert_links (const char *file, struct urlpos *links)
             char *newname = convert_basename (p, link);
             char *quoted_newname = local_quote_string (newname, link->link_css_p);
 
-            if (link->link_css_p)
+            if (link->link_css_p || link->link_noquote_html_p)
               p = replace_plain (p, link->size, fp, quoted_newname);
             else if (!link->link_refresh_p)
               p = replace_attr (p, link->size, fp, quoted_newname);
@@ -352,7 +352,7 @@ convert_links (const char *file, struct urlpos *links)
             char *newlink = link->url->url;
             char *quoted_newlink = html_quote_string (newlink);
 
-            if (link->link_css_p)
+            if (link->link_css_p || link->link_noquote_html_p)
               p = replace_plain (p, link->size, fp, newlink);
             else if (!link->link_refresh_p)
               p = replace_attr (p, link->size, fp, quoted_newlink);
diff --git a/src/convert.h b/src/convert.h
index b3cd196..e3ff6f0 100644
--- a/src/convert.h
+++ b/src/convert.h
@@ -69,6 +69,7 @@ struct urlpos {
   unsigned int link_base_p  :1;     /* the url came from <base href=...> */
   unsigned int link_inline_p    :1; /* needed to render the page */
   unsigned int link_css_p   :1;     /* the url came from CSS */
+  unsigned int link_noquote_html_p :1; /* from HTML, but doesn't need " */
   unsigned int link_expect_html :1; /* expected to contain HTML */
   unsigned int link_expect_css  :1; /* expected to contain CSS */
 
diff --git a/src/html-url.c b/src/html-url.c
index 0743587..ab04204 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -56,6 +56,7 @@ typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
 DECLARE_TAG_HANDLER (tag_find_urls);
 DECLARE_TAG_HANDLER (tag_handle_base);
 DECLARE_TAG_HANDLER (tag_handle_form);
+DECLARE_TAG_HANDLER (tag_handle_img);
 DECLARE_TAG_HANDLER (tag_handle_link);
 DECLARE_TAG_HANDLER (tag_handle_meta);
 
@@ -105,7 +106,7 @@ static struct known_tag {
   { TAG_FORM,    "form",        tag_handle_form },
   { TAG_FRAME,   "frame",       tag_find_urls },
   { TAG_IFRAME,  "iframe",      tag_find_urls },
-  { TAG_IMG,     "img",         tag_find_urls },
+  { TAG_IMG,     "img",         tag_handle_img },
   { TAG_INPUT,   "input",       tag_find_urls },
   { TAG_LAYER,   "layer",       tag_find_urls },
   { TAG_LINK,    "link",        tag_handle_link },
@@ -183,7 +184,8 @@ static const char *additional_attributes[] = {
   "name",                       /* used by tag_handle_meta  */
   "content",                    /* used by tag_handle_meta  */
   "action",                     /* used by tag_handle_form  */
-  "style"                       /* used by check_style_attr */
+  "style",                      /* used by check_style_attr */
+  "srcset",                     /* used by tag_handle_img */
 };
 
 static struct hash_table *interesting_tags;
@@ -674,6 +676,88 @@ tag_handle_meta (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *
     }
 }
 
+/* Handle the IMG tag.  This requires special handling for the srcset attr,
+   while the traditional src/lowsrc/href attributes can be handled generically.
+*/
+
+static void
+tag_handle_img (int tagid, struct taginfo *tag, struct map_context *ctx) {
+  int attrind;
+  char *srcset;
+
+  /* Use the generic approach for the attributes without special syntax. */
+  tag_find_urls(tagid, tag, ctx);
+
+  srcset = find_attr (tag, "srcset", &attrind);
+  if (srcset)
+    {
+      /* These are relative to the input text. */
+      int base_ind = ATTR_POS (tag,attrind,ctx);
+      int size = strlen (srcset);
+
+      /* These are relative to srcset. */
+      int offset, url_start, url_end;
+
+      /* Make sure to line up base_ind with srcset[0], not outside quotes. */
+      if (ctx->text[base_ind] == '"' || ctx->text[base_ind] == '\'')
+        ++base_ind;
+
+      offset = 0;
+      while (offset < size)
+        {
+          bool has_descriptor = true;
+
+          /* Skip over initial whitespace and commas. Note there is no \v
+            in HTML5 whitespace. */
+          url_start = offset + strspn (srcset + offset, " \f\n\r\t,");
+
+          if (url_start == size)
+            return;
+
+          /* URL is any non-whitespace chars (including commas) - but with
+             trailing commas removed. */
+          url_end = url_start + strcspn (srcset + url_start, " \f\n\r\t");
+          while ((url_end - 1) > url_start && srcset[url_end - 1] == ',')
+            {
+              has_descriptor = false;
+              --url_end;
+            }
+
+          if (url_end > url_start)
+            {
+              char *url_text = strdupdelim (srcset + url_start,
+                                            srcset + url_end);
+              struct urlpos *up = append_url (url_text, base_ind + url_start,
+                                              url_end - url_start, ctx);
+              up->link_inline_p = 1;
+              up->link_noquote_html_p = 1;
+              xfree (url_text);
+            }
+
+          /* If the URL wasn't terminated by a , there may also be a descriptor
+             which we just skip. */
+          if (has_descriptor)
+            {
+              /* This is comma-terminated, except there may be one level of
+                 parentheses escaping that. */
+              bool in_paren = false;
+              for (offset = url_end; offset < size; ++offset)
+                {
+                  char c = srcset[offset];
+                  if (c == '(')
+                    in_paren = true;
+                  else if (c == ')' && in_paren)
+                    in_paren = false;
+                  else if (c == ',' && !in_paren)
+                    break;
+                }
+            }
+          else
+            offset = url_end;
+        }
+    }
+}
+
 /* Dispatch the tag handler appropriate for the tag we're mapping
    over.  See known_tags[] for definition of tag handlers.  */

Re: [Bug-wget] Patch for understanding srcset= on img tags.

Reply via email to