Update of /var/cvs/speeltuin/andre/mmget/src/org/mmbase/mmget
In directory james.mmbase.org:/tmp/cvs-serv7713/src/org/mmbase/mmget
Modified Files:
CSSReader.java HTMLReader.java MMGet.java
ResourceReWriter.java ResourceWriter.java
Log Message:
With very large sites, for some reason, it runs out of connections. Or rather,
my computer seems to: all networked applications go on hold for a while.
See also:
http://cvs.mmbase.org/viewcvs/speeltuin/andre/mmget/src/org/mmbase/mmget
Index: CSSReader.java
===================================================================
RCS file: /var/cvs/speeltuin/andre/mmget/src/org/mmbase/mmget/CSSReader.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -b -r1.3 -r1.4
--- CSSReader.java 11 Mar 2009 08:34:20 -0000 1.3
+++ CSSReader.java 23 Mar 2009 22:30:22 -0000 1.4
@@ -19,9 +19,9 @@
* @import url("mystyle.css");
*
* @author André van Toly
- * @version $Id: CSSReader.java,v 1.3 2009/03/11 08:34:20 andre Exp $
+ * @version $Id: CSSReader.java,v 1.4 2009/03/23 22:30:22 andre Exp $
*/
-public class CSSReader extends UrlReader {
+public final class CSSReader extends UrlReader {
private static final Logger log =
Logging.getLoggerInstance(CSSReader.class);
private URLConnection uc = null;
@@ -87,6 +87,7 @@
}
public void close() throws IOException {
+ //log.debug("closing...");
inrdr.close();
}
Index: HTMLReader.java
===================================================================
RCS file: /var/cvs/speeltuin/andre/mmget/src/org/mmbase/mmget/HTMLReader.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -b -r1.4 -r1.5
--- HTMLReader.java 11 Mar 2009 08:34:20 -0000 1.4
+++ HTMLReader.java 23 Mar 2009 22:30:22 -0000 1.5
@@ -15,9 +15,9 @@
* Reads a web resource an returns its tags that may contain links to other
resources.
*
* @author André van Toly
- * @version $Id: HTMLReader.java,v 1.4 2009/03/11 08:34:20 andre Exp $
+ * @version $Id: HTMLReader.java,v 1.5 2009/03/23 22:30:22 andre Exp $
*/
-public class HTMLReader extends UrlReader {
+public final class HTMLReader extends UrlReader {
private static final Logger log =
Logging.getLoggerInstance(HTMLReader.class);
private URLConnection uc = null;
@@ -141,6 +141,7 @@
}
public void close() throws IOException {
+ //log.debug("closing...");
inrdr.close();
}
Index: MMGet.java
===================================================================
RCS file: /var/cvs/speeltuin/andre/mmget/src/org/mmbase/mmget/MMGet.java,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -b -r1.9 -r1.10
--- MMGet.java 12 Mar 2009 10:55:20 -0000 1.9
+++ MMGet.java 23 Mar 2009 22:30:22 -0000 1.10
@@ -25,7 +25,7 @@
* TODO: init rootURL early on, and check all urls against it (so we don't
travel up the rootURL)
*
* @author André van Toly
- * @version $Id: MMGet.java,v 1.9 2009/03/12 10:55:20 andre Exp $
+ * @version $Id: MMGet.java,v 1.10 2009/03/23 22:30:22 andre Exp $
*/
public final class MMGet {
@@ -49,8 +49,8 @@
/* not wanted: offsite, already tried but 404 etc. */
protected static Set<URL> ignoredURLs = new HashSet<URL>();
- /* urls to parse (html, css) */
- protected static List<URL> parseURLs = Collections.synchronizedList(new
ArrayList<URL>());
+ /* urls to read (html, css) */
+ protected static List<URL> readURLs = Collections.synchronizedList(new
ArrayList<URL>());
/* saved: url -> filename */
protected static Map<URL,String> savedURLs =
Collections.synchronizedMap(new HashMap<URL,String>());
/* rewrite these: url -> link in page / new link in rewritten page */
@@ -208,7 +208,7 @@
* @param link
*/
private String start() {
- parseURLs.clear();
+ readURLs.clear();
ignoredURLs.clear();
savedURLs.clear();
url2links.clear();
@@ -223,6 +223,7 @@
* @param url link to html page or css
*/
private void readUrl(URL url) {
+ if (url == null) return;
log.debug("---------------------------------------------------------------------");
log.debug("reading: " + url.toString());
@@ -241,7 +242,7 @@
log.error("Can't find '" + url + "' - " + e);
return;
}
- if (reader == null) return;
+ //if (reader == null) return;
try {
ArrayList<String> links = reader.getLinks();
@@ -271,7 +272,7 @@
continue;
}
}
- log.debug("link: " + linkURL.toString());
+ //log.debug("link: " + linkURL.toString());
if (ignoredURLs.contains(linkURL)) continue;
if (!linkURL.getHost().equals(url.getHost())) {
@@ -279,19 +280,16 @@
ignoredURLs.add(linkURL);
continue;
}
+ /*
if (!linkURL.toString().startsWith(startdirURL.toString())) {
- // if (linkURL.toString().length() <
startdirURL.toString().length()) { // BUG: Klopt niet!
log.info(linkURL.toString() + " -- UP TREE, not
following");
ignoredURLs.add(linkURL);
continue;
}
+ */
- String filename = null;
- if (savedURLs.containsKey(linkURL)) {
- filename = savedURLs.get(linkURL);
- log.debug("already saved");
-
- } else {
+ String filename = getSavedFilename(linkURL); // already
saved?
+ if (filename == null) {
ResourceWriter rw = null;
try {
rw = new ResourceWriter(linkURL);
@@ -299,23 +297,39 @@
if (rw.getContentType() < 1) {
rw.write();
+ rw.disconnect();
+
} else {
+ if (rw.getContentType() == CONTENTTYPE_HTML
+ &&
!linkURL.toString().startsWith(startdirURL.toString())) {
+ log.info(linkURL.toString() + " -- UP
TREE, not following");
+
+ if (!link.equals(linkURL.toString()) &&
!links2files.containsKey(link)) {
+ links2files.put(link,
linkURL.toString()); // replace with full url
+ }
rw.disconnect();
- addParseURL(linkURL); // save for later
- rw = null;
+ continue;
+ }
+ rw.disconnect();
+ // save for later
+ synchronized(readURLs) {
+ if (!readURLs.contains(linkURL))
readURLs.add(linkURL);
+ }
+
}
} catch(IOException e) {
log.error(e);
ignoredURLs.add(linkURL);
+ continue;
}
- if (rw == null) continue;
}
- String calclink = serverpart + "/" + filename; //
'calculated' link
+ StringBuilder calclink = new StringBuilder(serverpart);
+ calclink.append("/").append(filename); // 'calculated' link
String calcdir = dirURL.toString();
if (calcdir.endsWith("/")) calcdir = calcdir.substring(0,
calcdir.lastIndexOf("/"));
- String relative = UriParser.makeRelative(calcdir, calclink);
+ String relative = UriParser.makeRelative(calcdir,
calclink.toString());
if (!"".equals(link) && !links2files.containsKey(link) &&
!link.equals(relative)) { // only when different
log.debug("link2files: " + link + " -> " + relative);
links2files.put(link, relative); /* /dir/css/bla.css +
../css/bla.css */
@@ -327,18 +341,17 @@
synchronized(url2links) {
if (!url2links.containsKey(url)) url2links.put(url,
links2files);
}
+
ResourceReWriter rrw = null;
try {
rrw = new ResourceReWriter(url);
rrw.write();
-
} catch (IOException e) {
log.error(e);
ignoredURLs.add(url);
}
- URL nextURL = getParseURL();
- if (nextURL != null) readUrl(nextURL); // recurse!
+ readUrl(getReadURL()); // recurse!
} catch (IOException e) {
log.error("IOException: " + e);
@@ -346,7 +359,7 @@
}
- protected static int contentType(URLConnection uc) {
+ protected final static int contentType(URLConnection uc) {
String contentheader = uc.getHeaderField("content-type");
//log.debug("header: " + contentheader);
int pk = contentheader.indexOf(";");
@@ -391,7 +404,7 @@
/**
* remove ;jsessionid=a69bd9e162de1cfa3ea57ef6f3cf03af
*/
- public static String removeSessionid(String str) {
+ public final static String removeSessionid(String str) {
int pk = str.indexOf(";");
if (pk > -1) {
int q = str.indexOf("?");
@@ -411,7 +424,7 @@
* @param path the exact path from the startposition of the export
(that's seen as 'root')
* @return file
*/
- public File getFile(String path) {
+ public final File getFile(String path) {
File f;
String resource;
@@ -440,7 +453,7 @@
* @param file path or filename to check (not an URL!)
* @return true if it contains an extension like .html f.e.
*/
- public static boolean hasExtension(String file) {
+ public static final boolean hasExtension(String file) {
int i = file.lastIndexOf(".");
return (i != -1 && i != file.length() - 1);
}
@@ -454,16 +467,23 @@
return pathList;
}
*/
- private void addParseURL(URL url) {
- synchronized(parseURLs) {
- if (!parseURLs.contains(url)) parseURLs.add(url);
+ protected String getSavedFilename(URL url) {
+ synchronized(savedURLs) {
+ return savedURLs.get(url);
+ }
}
+
+ protected static void addSavedURL(URL url, String filename) {
+ synchronized(savedURLs) {
+ if (!savedURLs.containsKey(url)) savedURLs.put(url, filename);
}
+ }
+
- private URL getParseURL() {
+ private URL getReadURL() {
URL url = null;
- synchronized(parseURLs) {
- if (!parseURLs.isEmpty()) url = parseURLs.remove(0);
+ synchronized(readURLs) {
+ if (!readURLs.isEmpty()) url = readURLs.remove(0);
}
return url;
}
Index: ResourceReWriter.java
===================================================================
RCS file:
/var/cvs/speeltuin/andre/mmget/src/org/mmbase/mmget/ResourceReWriter.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -b -r1.4 -r1.5
--- ResourceReWriter.java 23 Mar 2009 21:12:53 -0000 1.4
+++ ResourceReWriter.java 23 Mar 2009 22:30:22 -0000 1.5
@@ -12,9 +12,9 @@
* Typically to be used for html and css files.
*
* @author André van Toly
- * @version $Id: ResourceReWriter.java,v 1.4 2009/03/23 21:12:53 andre Exp $
+ * @version $Id: ResourceReWriter.java,v 1.5 2009/03/23 22:30:22 andre Exp $
*/
-public class ResourceReWriter extends ResourceWriter {
+public final class ResourceReWriter extends ResourceWriter {
private static final Logger log =
Logging.getLoggerInstance(ResourceReWriter.class);
private URL url;
@@ -52,14 +52,18 @@
* @param uc the already elsewhere created URLConnection for efficiency
*/
private void rewrite() throws IOException {
- log.debug("REwriting: " + url + " -> file: " + filename);
+ if (log.isDebugEnabled()) log.debug("REwriting: " + url + " -> file: "
+ filename);
File f = getFile(filename);
if (f.exists()) {
//log.warn("File '" + f.toString() + "' already exists, deleting
it and saving again.");
f.delete();
}
- Map<String,String> links2files = MMGet.url2links.get(url);
+ Map<String,String> links2files = new HashMap<String,String>();
+ synchronized(MMGet.url2links) {
+ links2files = MMGet.url2links.remove(url);
+ }
+
BufferedReader in = new BufferedReader(new
InputStreamReader(url.openStream()));
PrintWriter out = new PrintWriter(new FileWriter(f));
String line;
@@ -83,6 +87,7 @@
if (!testlink.equals(link)) continue;
line = line.replace(hitlink, file);
+ if (log.isDebugEnabled())
log.debug("replaced '" + link + "' with '" + file + "'
in: " + filename);
}
}
@@ -93,7 +98,7 @@
in.close();
out.close();
- log.debug("Saved: " + url + " -> file: " + f.toString() );
+ if (log.isDebugEnabled()) log.debug("Saved: " + url + " -> file: " +
f.toString() );
}
Index: ResourceWriter.java
===================================================================
RCS file:
/var/cvs/speeltuin/andre/mmget/src/org/mmbase/mmget/ResourceWriter.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -b -r1.4 -r1.5
--- ResourceWriter.java 12 Mar 2009 10:55:20 -0000 1.4
+++ ResourceWriter.java 23 Mar 2009 22:30:22 -0000 1.5
@@ -13,7 +13,7 @@
* Writes a resource found on an url to disk.
*
* @author André van Toly
- * @version $Id: ResourceWriter.java,v 1.4 2009/03/12 10:55:20 andre Exp $
+ * @version $Id: ResourceWriter.java,v 1.5 2009/03/23 22:30:22 andre Exp $
*/
public class ResourceWriter {
private static final Logger log =
Logging.getLoggerInstance(ResourceWriter.class);
@@ -57,7 +57,7 @@
protected void disconnect() {
if (uc != null) {
- log.debug("disconnecting... " + url.toString());
+ //log.debug("disconnecting... " + url.toString());
uc.disconnect();
}
}
@@ -67,11 +67,23 @@
*/
protected void write() throws IOException {
File f = getFile(filename);
+
if (f.exists()) {
//log.warn("File '" + f.toString() + "' already exists, deleting
it and saving again.");
+ if (f.lastModified() <= uc.getLastModified()) {
f.delete();
+
+ } else {
+ log.info("Not modified: " + f.toString() + ", f:" +
f.lastModified() + " uc:" + uc.getLastModified());
+ // MMGet.savedURLs.put(url, filename);
+ MMGet.addSavedURL(url, filename);
+
+ return;
}
+ }
+
+
BufferedInputStream in = new
BufferedInputStream(uc.getInputStream());
BufferedOutputStream out = new BufferedOutputStream(new
FileOutputStream(f));
byte[] buf = new byte[1024];
@@ -85,7 +97,8 @@
out.close();
log.debug("Saved: " + f.toString() );
- MMGet.savedURLs.put(url, filename);
+ // MMGet.savedURLs.put(url, filename);
+ MMGet.addSavedURL(url, filename);
}
/**
@@ -96,7 +109,7 @@
*/
private static URLConnection getURLConnection(URL url) throws
SocketException, IOException {
URLConnection uc = url.openConnection();
- if (url.getProtocol().equals("http")) {
+ if (url.getProtocol().equals("http") ||
url.getProtocol().equals("https")) {
HttpURLConnection huc = (HttpURLConnection)uc;
int res = huc.getResponseCode();
if (res == -1) {
@@ -108,11 +121,13 @@
} else {
return huc;
}
+ /*
} else if (url.getProtocol().equals("file")) {
InputStream is = uc.getInputStream();
is.close();
// If that didn't throw an exception, the file is probably OK
return uc;
+ */
} else {
// return "(non-HTTP)";
return null;
@@ -164,22 +179,22 @@
String filename = link.substring(MMGet.serverpart.length());
if (filename.startsWith("/")) filename = filename.substring(1);
- log.debug("0: file: " + filename);
+ //log.debug("0: file: " + filename);
if (contenttype == MMGet.CONTENTTYPE_HTML) {
if (filename.equals("")) {
filename = "index.html";
} else if (!filename.endsWith("/") &&
!MMGet.hasExtension(filename)) {
filename = filename + "/index.html";
- log.debug("1: /bla file: " + filename); // TODO: add extra ../
to rewritten links !!?
+ //log.debug("1: /bla file: " + filename); // TODO: add extra
../ to rewritten links !!?
}
if (filename.endsWith("/")) {
filename = filename + "index.html";
- log.debug("2: /bla/ file: " + filename);
+ //log.debug("2: /bla/ file: " + filename);
}
}
- log.debug("url: " + url.toString() + " -> file: " + filename);
+ //log.debug("url: " + url.toString() + " -> file: " + filename);
return filename;
}
}
_______________________________________________
Cvs mailing list
[email protected]
http://lists.mmbase.org/mailman/listinfo/cvs