There have been several requests to reuse the wget HTML parser for finding links or page requisites; see the "wget questions" thread from August, or the "wget source code about -p option" thread from June.

These two patches provide a new binary that does just that.
The main drawback of the current code is that it is not hooked into the autoconf-generated Makefile. Maybe someone here who is literate in autoconf can help with that bit?
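
In case it helps whoever picks that up, my rough, untested guess at the src/Makefile.am hookup would be something along these lines (the variable names, the source list and the link against the gnulib library are assumptions, not verified against the actual build):

  bin_PROGRAMS += get-urls
  get_urls_SOURCES = get-urls.c html-url.c html-parse.c hash.c log.c url.c \
                     css_.c css-url.c iri.c utils.c exits.c host.c
  get_urls_LDADD   = ../lib/libgnu.a $(LIBS)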

In the meantime you can compile it by hand with:
gcc -o get-urls get-urls.c html-url.c html-parse.c hash.c log.c url.c ../lib/c-ctype.c ../lib/quotearg.c ../lib/c-strcasecmp.c ../lib/xalloc-die.c ../lib/xmalloc.c css_.c css-url.c iri.c utils.c ../lib/regex.c ../lib/exitfail.c exits.c host.c ../lib/localcharset.c ../lib/ioctl.c -I../lib -I. -lidn -lpcre

(you will need to build wget first so that some of those generated files exist)
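
Once built, get-urls takes a locally saved file plus the URL it was fetched from, and prints one line per extracted link followed by the flags wget tracks for it. Purely as an illustration (the example page and the exact flags shown are made up; they depend on what the document contains):

  $ ./get-urls page.html http://example.com/dir/page.html
  # http://example.com/dir/page.html
  http://example.com/style.css # relative inline css
  http://example.com/dir/other.html # relative

Passing --css as the first argument makes it parse the file as a stylesheet instead of HTML.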

Regards

From 244585ebf54233eb2103d19e2325da3dcb55ec3e Mon Sep 17 00:00:00 2001
From: Ángel González <[email protected]>
Date: Wed, 19 Jun 2013 17:20:48 +0200
Subject: [PATCH 1/2] Move free_urlpos() from retr.c to html-url.c

---
 src/html-url.c | 15 +++++++++++++++
 src/retr.c     | 15 ---------------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/html-url.c b/src/html-url.c
index bb2b20e..932b29d 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -858,3 +858,18 @@ cleanup_html_url (void)
   if (interesting_attributes)
     hash_table_destroy (interesting_attributes);
 }
+
+/* Free the linked list of urlpos.  */
+void
+free_urlpos (struct urlpos *l)
+{
+  while (l)
+    {
+      struct urlpos *next = l->next;
+      if (l->url)
+        url_free (l->url);
+      xfree_null (l->local_name);
+      xfree (l);
+      l = next;
+    }
+}
diff --git a/src/retr.c b/src/retr.c
index 683c811..714c78a 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -1166,21 +1166,6 @@ sleep_between_retrievals (int count)
     }
 }
 
-/* Free the linked list of urlpos.  */
-void
-free_urlpos (struct urlpos *l)
-{
-  while (l)
-    {
-      struct urlpos *next = l->next;
-      if (l->url)
-        url_free (l->url);
-      xfree_null (l->local_name);
-      xfree (l);
-      l = next;
-    }
-}
-
 /* Rotate FNAME opt.backups times */
 void
 rotate_backups(const char *fname)
-- 
1.8.4

From 57327307c1ce56f0478e95bb32b8818ec0d9aa78 Mon Sep 17 00:00:00 2001
From: Ángel González <[email protected]>
Date: Mon, 16 Sep 2013 01:33:40 +0200
Subject: [PATCH 2/2] Expose wget functionality for extracting links from a web
 page.

Provided by a new program called get-urls
---
 src/get-urls.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 src/get-urls.c

diff --git a/src/get-urls.c b/src/get-urls.c
new file mode 100644
index 0000000..9393a62
--- /dev/null
+++ b/src/get-urls.c
@@ -0,0 +1,75 @@
+#include "wget.h"
+
+#include <quote.h>
+#include <stdio.h>
+
+#include "url.h"
+#include "convert.h"
+#include "html-url.h"
+#include "css-url.h"
+
+
+void
+print_urls (const char *file, const char *url, bool is_css)
+{
+  bool meta_disallow_follow = false; /* Output value */
+  struct urlpos *child;
+  struct urlpos *children
+    = is_css ? get_urls_css_file (file, url) :
+               get_urls_html (file, url, &meta_disallow_follow, NULL);
+
+  printf ("# %s\n", url);
+
+  child = children;
+  for (; child; child = child->next)
+    {
+      printf ("%s #", child->url->url);
+
+      if (child->ignore_when_downloading)
+        printf (" ignore");
+      if (child->link_relative_p)
+        printf (" relative");
+      if (child->link_complete_p)
+        printf (" complete");
+      if (child->link_base_p)
+        printf (" base");
+      if (child->link_inline_p)
+        printf (" inline");
+      if (child->link_css_p)
+        printf (" fromcss");
+      if (child->link_expect_html)
+        printf (" html");
+      if (child->link_expect_css)
+        printf (" css");
+      if (child->link_refresh_p)
+        printf (" refresh");
+      printf ("\n");
+    }
+
+  free_urlpos (children);
+}
+
+const char *exec_name = "get-urls";
+struct options opt;
+
+int
+main (int argc, char *argv[])
+{
+  bool is_css = false;
+
+  if (argc > 1 && !strcmp ("--css", argv[1]))
+    {
+      is_css = true;
+      argc--;
+      argv++;
+    }
+
+  if (argc < 3)
+    {
+      fprintf (stderr, _("Usage: %s [--css] file original-URL\n"), exec_name);
+      return 1;
+    }
+
+  print_urls (argv[1], argv[2], is_css);
+  return 0;
+}
-- 
1.8.4
