There have been several requests to reuse the wget html parser for
finding links or requisites.
See "wget questions" thread from August, or "wget source code about -p
option" from June.
These two patches provide a new binary doing just that.
The main drawback of the current code is that it is not yet hooked
into the autoconf-generated Makefile. Maybe someone here who is
fluent in autoconf can help with that bit?
Meanwhile you can compile it with:
gcc -o get-urls get-urls.c html-url.c html-parse.c hash.c log.c
url.c ../lib/c-ctype.c ../lib/quotearg.c ../lib/c-strcasecmp.c
../lib/xalloc-die.c ../lib/xmalloc.c css_.c css-url.c iri.c utils.c
../lib/regex.c ../lib/exitfail.c exits.c host.c ../lib/localcharset.c
../lib/ioctl.c -I../lib -I. -lidn -lpcre
(you will need to compile wget first so that some of those files get created)
Regards
>From 244585ebf54233eb2103d19e2325da3dcb55ec3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81ngel=20Gonz=C3=A1lez?= <[email protected]>
Date: Wed, 19 Jun 2013 17:20:48 +0200
Subject: [PATCH 1/2] Moved free_urlpos()
---
src/html-url.c | 15 +++++++++++++++
src/retr.c | 15 ---------------
2 files changed, 15 insertions(+), 15 deletions(-)
diff --git a/src/html-url.c b/src/html-url.c
index bb2b20e..932b29d 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -858,3 +858,18 @@ cleanup_html_url (void)
if (interesting_attributes)
hash_table_destroy (interesting_attributes);
}
+
+/* Free the linked list of urlpos L: each node's url (if set), its local_name, and the node itself. */
+void
+free_urlpos (struct urlpos *l)
+{
+ while (l)
+ {
+ struct urlpos *next = l->next; /* read the link before L is released below */
+ if (l->url)
+ url_free (l->url);
+ xfree_null (l->local_name); /* NOTE(review): presumably a NULL-tolerant free -- confirm */
+ xfree (l);
+ l = next;
+ }
+}
diff --git a/src/retr.c b/src/retr.c
index 683c811..714c78a 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -1166,21 +1166,6 @@ sleep_between_retrievals (int count)
}
}
-/* Free the linked list of urlpos. */
-void
-free_urlpos (struct urlpos *l)
-{
- while (l)
- {
- struct urlpos *next = l->next;
- if (l->url)
- url_free (l->url);
- xfree_null (l->local_name);
- xfree (l);
- l = next;
- }
-}
-
/* Rotate FNAME opt.backups times */
void
rotate_backups(const char *fname)
--
1.8.4
>From 57327307c1ce56f0478e95bb32b8818ec0d9aa78 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81ngel=20Gonz=C3=A1lez?= <[email protected]>
Date: Mon, 16 Sep 2013 01:33:40 +0200
Subject: [PATCH 2/2] Expose wget functionality for extracting links from a web
page.
Provided by a new program called get-urls
---
src/get-urls.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 75 insertions(+)
create mode 100644 src/get-urls.c
diff --git a/src/get-urls.c b/src/get-urls.c
new file mode 100644
index 0000000..9393a62
--- /dev/null
+++ b/src/get-urls.c
@@ -0,0 +1,75 @@
+#include "wget.h"
+
+#include <quote.h>
+#include <stdio.h>
+
+#include "url.h"
+#include "convert.h"
+#include "html-url.h"
+#include "css-url.h"
+
+
+/* Print every link found in FILE, one per line, each followed by " #"
+   and a word for every flag set on it.  URL is handed to the extractor
+   together with FILE and is echoed first on a "# " header line.  */
+void
+print_urls (const char *file, const char *url, bool is_css)
+{
+  bool nofollow = false;  /* output argument of get_urls_html; not read here */
+  struct urlpos *links, *cur;
+
+  /* CSS input and HTML input go through different extractors.  */
+  if (is_css)
+    links = get_urls_css_file (file, url);
+  else
+    links = get_urls_html (file, url, &nofollow, NULL);
+
+  /* Header line.  */
+  printf ("# %s\n", url);
+
+  /* One output line per list node.  */
+  for (cur = links; cur != NULL; cur = cur->next)
+    {
+      printf ("%s #", cur->url->url);
+
+      /* A flag that is clear contributes an empty string.  */
+      fputs (cur->ignore_when_downloading ? " ignore" : "", stdout);
+      fputs (cur->link_relative_p ? " relative" : "", stdout);
+      fputs (cur->link_complete_p ? " complete" : "", stdout);
+      fputs (cur->link_base_p ? " base" : "", stdout);
+      fputs (cur->link_inline_p ? " inline" : "", stdout);
+      fputs (cur->link_css_p ? " fromcss" : "", stdout);
+      fputs (cur->link_expect_html ? " html" : "", stdout);
+      fputs (cur->link_expect_css ? " css" : "", stdout);
+      fputs (cur->link_refresh_p ? " refresh" : "", stdout);
+      fputs ("\n", stdout);
+    }
+
+  /* Release the whole list.  */
+  free_urlpos (links);
+}
+
+const char *exec_name = "get-urls";
+struct options opt;
+
+/* Usage: get-urls [--css] file original-URL.  Returns 1 on bad usage.  */
+int
+main (int argc, char *argv[])
+{
+  bool is_css = false;
+
+  if (argc > 1 && strcmp ("--css", argv[1]) == 0)
+    {
+      is_css = true;
+      argc--;                   /* shift the flag out so FILE is argv[1] */
+      argv++;
+    }
+  /* argv[1] and argv[2] are both read below, so two operands are required.  */
+  if (argc < 3)
+    {
+      fprintf (stderr, _("Usage: %s [--css] file original-URL\n"), exec_name);
+      return 1;
+    }
+  print_urls (argv[1], argv[2], is_css);
+  return 0;
+}
--
1.8.4