Update of /var/cvs/speeltuin/andre/mmget/src/org/mmbase/mmget
In directory james.mmbase.org:/tmp/cvs-serv32280/src/org/mmbase/mmget
Modified Files:
HTMLReader.java MMGet.java UrlReaders.java
Log Message:
Works, but runs into memory issues with very big sites.
See also:
http://cvs.mmbase.org/viewcvs/speeltuin/andre/mmget/src/org/mmbase/mmget
Index: HTMLReader.java
===================================================================
RCS file: /var/cvs/speeltuin/andre/mmget/src/org/mmbase/mmget/HTMLReader.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -b -r1.2 -r1.3
--- HTMLReader.java 27 Feb 2009 10:45:07 -0000 1.2
+++ HTMLReader.java 1 Mar 2009 11:26:11 -0000 1.3
@@ -15,7 +15,7 @@
* Reads a web resource an returns its tags that may contain links to other
resources.
*
* @author André van Toly
- * @version $Id: HTMLReader.java,v 1.2 2009/02/27 10:45:07 andre Exp $
+ * @version $Id: HTMLReader.java,v 1.3 2009/03/01 11:26:11 andre Exp $
*/
public class HTMLReader extends UrlReader {
private static final Logger log =
Logging.getLoggerInstance(HTMLReader.class);
@@ -55,7 +55,7 @@
while ((tag = nextTag()) != null) {
for (int i = 0; i < wantTags.length; i++) {
if (tag.startsWith(wantTags[i])) {
- String link = readLinkformTag(tag);
+ String link = extractHREF(tag);
if (link != null) al.add(link);
continue; // optimization
}
@@ -68,17 +68,6 @@
return MMGet.contentType(uc);
}
- public static String readLinkformTag(String tag) {
- String href = null;
- href = MMGet.extractHREF(tag);
-
- if (href.startsWith("mailto") || href.startsWith("#") ||
href.startsWith("javascript")) {
- //log.info(href + " -- NOT FOLLOWING (yet)"); // Can't be used
(for now), TODO: todo's here?
- return null;
- }
- return href;
- }
-
/**
* Reads a tags and its contents.
* @return the tag
@@ -93,6 +82,45 @@
}
/**
+ * Extracts the link from a tag.
+ *
+ * @param tag the first parameter
+ * @return a link to a resource hopefully
+ */
+ public static String extractHREF(String tag) {
+ String lcTag = tag.toLowerCase();
+ String attr;
+ int p1, p2, p3, p4;
+
+ if (lcTag.startsWith("<a ") || lcTag.startsWith("<link ") ||
lcTag.startsWith("<area ")) {
+ attr = "href=";
+ } else {
+ attr = "src="; // TODO: src's of css in html
+ }
+
+ p1 = lcTag.indexOf(attr);
+ if (p1 < 0) {
+ log.warn("Can't find attribute '" + attr + "' in '" + tag + "'");
+ return null;
+ }
+
+ p2 = tag.indexOf("=", p1);
+ p3 = tag.indexOf("\"", p2);
+ p4 = tag.indexOf("\"", p3 + 1);
+ if (p3 < 0 || p4 < 0) {
+ log.warn("Invalide attribute '" + attr + "' in '" + tag + "'");
+ }
+
+ String href = tag.substring(p3 + 1, p4);
+ if (href.startsWith("mailto") || href.startsWith("#") ||
href.startsWith("javascript")) {
+ //log.info(href + " -- NOT FOLLOWING (yet)"); // Can't be used
(for now), TODO: todo's here?
+ return null;
+ }
+
+ return href;
+ }
+
+ /**
* Read the next tag
* @return a complete tag, like <img scr="foo.gif" />
*/
Index: MMGet.java
===================================================================
RCS file: /var/cvs/speeltuin/andre/mmget/src/org/mmbase/mmget/MMGet.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -b -r1.4 -r1.5
--- MMGet.java 27 Feb 2009 15:30:52 -0000 1.4
+++ MMGet.java 1 Mar 2009 11:26:11 -0000 1.5
@@ -25,7 +25,7 @@
* TODO: init rootURL early on, and check all urls against it (so we don't
travel up the rootURL)
*
* @author André van Toly
- * @version $Id: MMGet.java,v 1.4 2009/02/27 15:30:52 andre Exp $
+ * @version $Id: MMGet.java,v 1.5 2009/03/01 11:26:11 andre Exp $
*/
public final class MMGet {
@@ -46,7 +46,7 @@
public static String directory;
protected static File savedir;
- /* not wanted: offsite etc. */
+ /* not wanted: offsite, already tried but 404 etc. */
protected Set<URL> ignoredURLs = new HashSet<URL>();
/* urls to parse (html, css) */
protected List<URL> parseURLs = Collections.synchronizedList(new
ArrayList<URL>());
@@ -86,9 +86,17 @@
ResourceLoader webroot = ResourceLoader.getWebRoot();
startURL = new URL(url);
- startdirURL = getDirectoryURL(startURL);
- if (startdirURL.toString().length() > startURL.toString().length())
+ String strUrl = startURL.toString();
+ if (strUrl.lastIndexOf("/") > 5) {
+ strUrl = strUrl.substring(0, strUrl.lastIndexOf("/") + 1);
+ startdirURL = new URL(strUrl);
+ } else {
+ startdirURL = startURL;
+ }
+ //startdirURL = getDirectoryURL(startURL);
+ /*if (startdirURL.toString().length() > startURL.toString().length())
startURL = startdirURL;
+ */
// savedir
if (directory == null || "".equals(directory) ||
!webroot.getResource(directory).openConnection().getDoInput()) {
@@ -167,6 +175,12 @@
return "Error: " + status;
}
+ StringBuilder info = new StringBuilder();
+ info.append("\n*** url: ").append(startURL.toString());
+ info.append("\n** dir.: ").append(startdirURL.toString());
+ info.append("\n* saved in: ").append(savedir.toString());
+ log.info(info.toString());
+
Future<String> fthread = ThreadPools.jobsExecutor.submit(new
Callable() {
public String call() {
return start();
@@ -181,13 +195,9 @@
log.error(e);
}
- StringBuilder info = new StringBuilder(status);
- info.append("\n*** url: ").append(startURL.toString());
- info.append("\n** dir.: ").append(startdirURL.toString());
- info.append("\n* saved in: ").append(savedir.toString());
- status = info.toString();
+ info.append(status);
log.info(status);
- return status;
+ return info.toString();
}
/**
@@ -212,11 +222,16 @@
log.debug("---------------------------------------------------------------------");
log.debug("reading: " + url.toString());
+ URL dirURL;
UrlReader reader = null;
try {
reader = UrlReaders.getUrlReader(url);
+ dirURL = getDirectoryURL(url);
+ } catch (MalformedURLException e) {
+ log.error("Can't parse '" + url + "' - " + e);
+ return;
} catch (IOException e) {
- log.error("Can't parse: " + e);
+ log.error("Can't parse '" + url + "' - " + e);
return;
}
if (reader == null) return;
@@ -225,8 +240,7 @@
ArrayList<String> links = reader.getLinks();
Map<String,String> links2files = new HashMap<String,String>();
/* maps a harvested link to the resulting saved file if different */
- URL dirURL = getDirectoryURL(url);
- if (startdirURL == null) startdirURL = dirURL;
+ //if (startdirURL == null) startdirURL = dirURL;
String calcUrl = startdirURL.toString() + makeFilename(url,
reader.getContentType());
String calcDir = calcUrl.substring(0, calcUrl.lastIndexOf("/"));
@@ -238,12 +252,21 @@
while (it.hasNext()) {
String link = it.next();
link = removeSessionid(link); // remove sessionid crud etc.
(changes over time)
-
URL linkURL;
if (link.indexOf("://") < 0) {
+ try {
linkURL = new URL(url, link);
+ } catch (MalformedURLException e) {
+ log.warn("Can't parse '" + link + "' - " + e);
+ continue;
+ }
} else {
+ try {
linkURL = new URL(link);
+ } catch (MalformedURLException e) {
+ log.warn("Can't parse '" + link + "' - " + e);
+ continue;
+ }
}
if (ignoredURLs.contains(linkURL)) continue;
@@ -252,8 +275,9 @@
ignoredURLs.add(linkURL);
continue;
}
- if (linkURL.toString().length() <
startdirURL.toString().length()) { // BUG: Klopt niet!
- //log.info(linkURL.toString() + " -- UP TREE, not
following");
+ if (!linkURL.toString().startsWith(startdirURL.toString())) {
+ // if (linkURL.toString().length() <
startdirURL.toString().length()) { // BUG: Klopt niet!
+ log.info(linkURL.toString() + " -- UP TREE, not
following");
ignoredURLs.add(linkURL);
continue;
}
@@ -261,6 +285,7 @@
// save resource
String filename = saveResource(linkURL);
if (filename == null) continue;
+ //log.debug("filename: " + filename);
// !!? String dir = dirURL.toString(); /* remove last / from
dir for UriParser */
// !!? if (dir.endsWith("/")) dir = dir.substring(0,
dir.lastIndexOf("/"));
@@ -269,7 +294,7 @@
String relative = UriParser.makeRelative(calcDir, calclink);
if (!"".equals(link) && !links2files.containsKey(link) &&
!link.equals(relative)) { // only when different
- log.debug("link: " + link + ", relative: " + relative);
+ //log.debug("link: " + link + ", relative: " + relative);
links2files.put(link, relative); /* /dir/css/bla.css +
../css/bla.css */
}
@@ -299,6 +324,7 @@
if (savedURLs.containsKey(url)) {
return savedURLs.get(url);
}
+ if (ignoredURLs.contains(url)) return null;
URLConnection uc = null;
try {
@@ -306,7 +332,10 @@
} catch (SocketException e) {
log.warn(e);
}
- if (uc == null) return null;
+ if (uc == null) {
+ ignoredURLs.add(url);
+ return null;
+ }
int type = contentType(uc);
if (type > 0) {
@@ -374,25 +403,24 @@
StringBuilder sbf = new StringBuilder();
sbf.append("\"").append(file).append("\"");
- int pos = line.indexOf(sbl.toString());
- if (pos > -1) {
- int pos2 = line.indexOf("\"", pos + 1);
- log.debug("pos: " + pos + ", pos2: " + pos2);
- String linelink = line.substring(pos, pos2 + 1);
- log.debug("linelink: " + linelink);
+ int pos1 = line.indexOf(sbl.toString());
+ if (pos1 > -1) {
+ int pos2 = line.indexOf("\"", pos1 + 1);
+ //log.debug("pos1: " + pos1 + ", pos2: " + pos2);
+ String linelink = line.substring(pos1, pos2 + 1);
+ //log.debug("linelink: " + linelink);
- // compensate for
;jsessionid=ECF5A0BB7709202CEDC4D7FBA3AC3AAD
- if ((pos2 - pos) > link.length() &&
linelink.indexOf(";") > -1) {
+ // compensate for
;jsessionid=ECF5A0BB7709202CEDC4D7FBA3AC3AAD etc.
+ if ((pos2 - pos1) > link.length() &&
linelink.indexOf(";") > -1) {
link = linelink;
} else {
sbl.append("\"");
link = sbl.toString();
}
- log.debug("link: " + link);
+ //log.debug("link: " + link);
- //sbl.append("\"");
line = line.replace(link, sbf.toString());
- log.debug("replaced '" + link + "' with '" + sbf + "'
in: " + filename);
+ //log.debug("replaced '" + link + "' with '" + sbf +
"' in: " + filename);
}
}
}
@@ -484,10 +512,24 @@
*
* @param url resource for which a filename is needed
* @param type content-type of the file to save
- * @return path and filename that can be saved (f.e. dir/bla)
+ * @return path and filename that can be saved (f.e. pics/button.gif)
*/
public String makeFilename(URL url, int type) {
- String filename = "";
+ /*
+
+ start: www.toly.nl/bla
+ link: www.toly.nl/pics/button.gif
+
+ filename: 1up/pics/buttons.gif
+
+ start: www.toly.nl/bla/bla
+ link: www.toly.nl/pics/button.gif
+
+ filename: 2up/pics/buttons.gif
+
+ */
+ String filename = url.getFile();
+ filename = removeSessionid(filename);
String link = url.toString();
link = removeSessionid(link);
@@ -603,38 +645,15 @@
return (i != -1 && i != file.length() - 1);
}
- /**
- * Extracts the link from a tag.
- *
- * @param tag the first parameter
- * @return a link to a resource hopefully
- */
- public static String extractHREF(String tag) {
- String lcTag = tag.toLowerCase();
- String attr;
- int p1, p2, p3, p4;
-
- if (lcTag.startsWith("<a ") || lcTag.startsWith("<link ") ||
lcTag.startsWith("<area ")) {
- attr = "href";
- } else {
- attr = "src"; // TODO: src's of css in html
+/*
+ public List splitPath(String path) {
+ List<String> pathList = new ArrayList<String>();
+ for (String p: path.split("/")) {
+ if (!p.equals("")) pathList.add(p);
}
-
- p1 = lcTag.indexOf(attr);
- if (p1 < 0) {
- log.warn("Can't find attribute '" + attr + "' in '" + tag + "'");
+ return pathList;
}
- p2 = tag.indexOf("=", p1);
- p3 = tag.indexOf("\"", p2);
- p4 = tag.indexOf("\"", p3 + 1);
- if (p3 < 0 || p4 < 0) {
- log.warn("Invalide attribute '" + attr + "' in '" + tag + "'");
- }
-
- String href = tag.substring(p3 + 1, p4);
- return href;
- }
-
+*/
private void addParseURL(URL url) {
synchronized(parseURLs) {
if (!parseURLs.contains(url)) parseURLs.add(url);
Index: UrlReaders.java
===================================================================
RCS file: /var/cvs/speeltuin/andre/mmget/src/org/mmbase/mmget/UrlReaders.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -b -r1.2 -r1.3
--- UrlReaders.java 27 Feb 2009 10:45:07 -0000 1.2
+++ UrlReaders.java 1 Mar 2009 11:26:11 -0000 1.3
@@ -16,7 +16,7 @@
* UrlConnection.
*
* @author André van Toly
- * @version $Id: UrlReaders.java,v 1.2 2009/02/27 10:45:07 andre Exp $
+ * @version $Id: UrlReaders.java,v 1.3 2009/03/01 11:26:11 andre Exp $
*/
public class UrlReaders {
private static final Logger log =
Logging.getLoggerInstance(UrlReaders.class);
@@ -25,7 +25,7 @@
protected URL url = null;
protected static int contenttype = -1;
- public static UrlReader getUrlReader(URL url) throws IOException {
+ public static UrlReader getUrlReader(URL url) throws IOException,
MalformedURLException {
URLConnection uc = url.openConnection();
contenttype = MMGet.contentType(uc);
_______________________________________________
Cvs mailing list
[email protected]
http://lists.mmbase.org/mailman/listinfo/cvs