Update of /var/cvs/speeltuin/andre/mmget/src/org/mmbase/mmget
In directory james.mmbase.org:/tmp/cvs-serv7713/src/org/mmbase/mmget

Modified Files:
        CSSReader.java HTMLReader.java MMGet.java 
        ResourceReWriter.java ResourceWriter.java 
Log Message:
With very large sites, for some reason, it runs out of connections. Or rather,
my computer seems to: all networked applications go on hold for a while.


See also: 
http://cvs.mmbase.org/viewcvs/speeltuin/andre/mmget/src/org/mmbase/mmget


Index: CSSReader.java
===================================================================
RCS file: /var/cvs/speeltuin/andre/mmget/src/org/mmbase/mmget/CSSReader.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -b -r1.3 -r1.4
--- CSSReader.java      11 Mar 2009 08:34:20 -0000      1.3
+++ CSSReader.java      23 Mar 2009 22:30:22 -0000      1.4
@@ -19,9 +19,9 @@
  *   @import url("mystyle.css");
  *
  * @author André van Toly
- * @version $Id: CSSReader.java,v 1.3 2009/03/11 08:34:20 andre Exp $
+ * @version $Id: CSSReader.java,v 1.4 2009/03/23 22:30:22 andre Exp $
  */
-public class CSSReader extends UrlReader {
+public final class CSSReader extends UrlReader {
     private static final Logger log = 
Logging.getLoggerInstance(CSSReader.class);
     
     private URLConnection uc = null;
@@ -87,6 +87,7 @@
     }
 
     public void close() throws IOException {
+        //log.debug("closing...");
         inrdr.close();
     }
     


Index: HTMLReader.java
===================================================================
RCS file: /var/cvs/speeltuin/andre/mmget/src/org/mmbase/mmget/HTMLReader.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -b -r1.4 -r1.5
--- HTMLReader.java     11 Mar 2009 08:34:20 -0000      1.4
+++ HTMLReader.java     23 Mar 2009 22:30:22 -0000      1.5
@@ -15,9 +15,9 @@
  * Reads a web resource an returns its tags that may contain links to other 
resources. 
  *
  * @author André van Toly
- * @version $Id: HTMLReader.java,v 1.4 2009/03/11 08:34:20 andre Exp $
+ * @version $Id: HTMLReader.java,v 1.5 2009/03/23 22:30:22 andre Exp $
  */
-public class HTMLReader extends UrlReader {
+public final class HTMLReader extends UrlReader {
     private static final Logger log = 
Logging.getLoggerInstance(HTMLReader.class);
     
     private URLConnection uc = null;
@@ -141,6 +141,7 @@
     }
     
     public void close() throws IOException {
+        //log.debug("closing...");
         inrdr.close();
     }
 


Index: MMGet.java
===================================================================
RCS file: /var/cvs/speeltuin/andre/mmget/src/org/mmbase/mmget/MMGet.java,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -b -r1.9 -r1.10
--- MMGet.java  12 Mar 2009 10:55:20 -0000      1.9
+++ MMGet.java  23 Mar 2009 22:30:22 -0000      1.10
@@ -25,7 +25,7 @@
  * TODO: init rootURL early on, and check all urls against it (so we don't 
travel up the rootURL)
  *
  * @author André van Toly
- * @version $Id: MMGet.java,v 1.9 2009/03/12 10:55:20 andre Exp $
+ * @version $Id: MMGet.java,v 1.10 2009/03/23 22:30:22 andre Exp $
  */
 public final class MMGet {
     
@@ -49,8 +49,8 @@
 
     /* not wanted: offsite, already tried but 404 etc. */
     protected static Set<URL> ignoredURLs = new HashSet<URL>();
-    /* urls to parse (html, css) */
-    protected static List<URL> parseURLs = Collections.synchronizedList(new 
ArrayList<URL>());
+    /* urls to read (html, css) */
+    protected static List<URL> readURLs = Collections.synchronizedList(new 
ArrayList<URL>());
     /* saved: url -> filename */
     protected static Map<URL,String> savedURLs = 
Collections.synchronizedMap(new HashMap<URL,String>());
     /* rewrite these: url -> link in page / new link in rewritten page */
@@ -208,7 +208,7 @@
      * @param  link 
      */
     private String start() {
-        parseURLs.clear();
+        readURLs.clear();
         ignoredURLs.clear();
         savedURLs.clear();
         url2links.clear();
@@ -223,6 +223,7 @@
      * @param url   link to html page or css
      */
     private void readUrl(URL url) {
+        if (url == null) return;
         
log.debug("---------------------------------------------------------------------");
         log.debug("reading:   " + url.toString());
         
@@ -241,7 +242,7 @@
             log.error("Can't find '" + url + "' - " + e);
             return;
         }
-        if (reader == null) return;
+        //if (reader == null) return;
         
         try {
             ArrayList<String> links = reader.getLinks();
@@ -271,7 +272,7 @@
                         continue;
                     }
                 }
-                log.debug("link: " + linkURL.toString());
+                //log.debug("link: " + linkURL.toString());
                 
                 if (ignoredURLs.contains(linkURL)) continue;
                 if (!linkURL.getHost().equals(url.getHost())) {
@@ -279,19 +280,16 @@
                     ignoredURLs.add(linkURL);
                     continue;
                 }
+                /*
                 if (!linkURL.toString().startsWith(startdirURL.toString())) {
-                    // if (linkURL.toString().length() < 
startdirURL.toString().length()) {    // BUG: Klopt niet!
                     log.info(linkURL.toString() + " -- UP TREE, not 
following");
                     ignoredURLs.add(linkURL);
                     continue;
                 }
+                */
                     
-                String filename = null;
-                if (savedURLs.containsKey(linkURL)) {
-                    filename = savedURLs.get(linkURL);
-                    log.debug("already saved");
-                    
-                } else {
+                String filename = getSavedFilename(linkURL);    // already 
saved?
+                if (filename == null) {
                     ResourceWriter rw = null;
                     try {
                         rw = new ResourceWriter(linkURL);
@@ -299,23 +297,39 @@
                         
                         if (rw.getContentType() < 1) {
                             rw.write(); 
+                            rw.disconnect();
+                            
                         } else {
+                            if (rw.getContentType() == CONTENTTYPE_HTML
+                                && 
!linkURL.toString().startsWith(startdirURL.toString())) {
+                                    log.info(linkURL.toString() + " -- UP 
TREE, not following");
+                                    
+                                    if (!link.equals(linkURL.toString()) && 
!links2files.containsKey(link)) {
+                                        links2files.put(link, 
linkURL.toString());  // replace with full url
+                                    }
                             rw.disconnect();
-                            addParseURL(linkURL);   // save for later
-                            rw = null;
+                                    continue;
+                            }
+                            rw.disconnect();
+                            // save for later
+                            synchronized(readURLs) {
+                                if (!readURLs.contains(linkURL)) 
readURLs.add(linkURL);
+                            }
+                            
                         }
                     } catch(IOException e) {
                         log.error(e);
                         ignoredURLs.add(linkURL);
+                        continue;
                     }
-                    if (rw == null) continue;
                 }
                 
-                String calclink = serverpart + "/" + filename;    // 
'calculated' link
+                StringBuilder calclink = new StringBuilder(serverpart);
+                calclink.append("/").append(filename);    // 'calculated' link
                 String calcdir  = dirURL.toString();
                 if (calcdir.endsWith("/")) calcdir = calcdir.substring(0, 
calcdir.lastIndexOf("/"));
                 
-                String relative = UriParser.makeRelative(calcdir, calclink);
+                String relative = UriParser.makeRelative(calcdir, 
calclink.toString());
                 if (!"".equals(link) && !links2files.containsKey(link) && 
!link.equals(relative)) { // only when different
                     log.debug("link2files: " + link + " -> " + relative);
                     links2files.put(link, relative); /* /dir/css/bla.css + 
../css/bla.css */
@@ -327,18 +341,17 @@
             synchronized(url2links) {
                 if (!url2links.containsKey(url)) url2links.put(url, 
links2files);
             }
+            
             ResourceReWriter rrw = null;
             try {
                 rrw = new ResourceReWriter(url);
                 rrw.write();
-
             } catch (IOException e) {
                 log.error(e);
                 ignoredURLs.add(url);
             }
             
-            URL nextURL = getParseURL();
-            if (nextURL != null) readUrl(nextURL);  // recurse!
+            readUrl(getReadURL());  // recurse!
             
         } catch (IOException e) {
             log.error("IOException: " + e);
@@ -346,7 +359,7 @@
         
     }
     
-    protected static int contentType(URLConnection uc) {
+    protected final static int contentType(URLConnection uc) {
         String contentheader = uc.getHeaderField("content-type");
         //log.debug("header: " + contentheader);
         int pk = contentheader.indexOf(";");
@@ -391,7 +404,7 @@
     /**
      * remove ;jsessionid=a69bd9e162de1cfa3ea57ef6f3cf03af
      */
-    public static String removeSessionid(String str) {
+    public final static String removeSessionid(String str) {
         int pk = str.indexOf(";");
         if (pk > -1) {
             int q = str.indexOf("?");
@@ -411,7 +424,7 @@
      * @param  path the exact path from the startposition of the export 
(that's seen as 'root')
      * @return file
      */
-    public File getFile(String path) {
+    public final File getFile(String path) {
         File f;
         String resource;
         
@@ -440,7 +453,7 @@
      * @param   file    path or filename to check (not an URL!)
      * @return  true if it contains an extension like .html f.e.
      */
-     public static boolean hasExtension(String file) {
+     public static final boolean hasExtension(String file) {
         int i = file.lastIndexOf(".");
         return (i != -1 && i != file.length() - 1);
     }
@@ -454,16 +467,23 @@
         return pathList;
     }
 */
-    private void addParseURL(URL url) {
-        synchronized(parseURLs) {
-            if (!parseURLs.contains(url)) parseURLs.add(url);
+    protected String getSavedFilename(URL url) {
+        synchronized(savedURLs) {
+            return savedURLs.get(url);
+        }
         }
+
+    protected static void addSavedURL(URL url, String filename) {
+        synchronized(savedURLs) {
+            if (!savedURLs.containsKey(url)) savedURLs.put(url, filename);
     }
+    }
+
 
-    private URL getParseURL() {
+    private URL getReadURL() {
         URL url = null;
-        synchronized(parseURLs) {
-            if (!parseURLs.isEmpty()) url = parseURLs.remove(0);
+        synchronized(readURLs) {
+            if (!readURLs.isEmpty()) url = readURLs.remove(0);
         }
         return url;
     }


Index: ResourceReWriter.java
===================================================================
RCS file: 
/var/cvs/speeltuin/andre/mmget/src/org/mmbase/mmget/ResourceReWriter.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -b -r1.4 -r1.5
--- ResourceReWriter.java       23 Mar 2009 21:12:53 -0000      1.4
+++ ResourceReWriter.java       23 Mar 2009 22:30:22 -0000      1.5
@@ -12,9 +12,9 @@
  * Typically to be used for html and css files.
  *
  * @author Andr&eacute; van Toly
- * @version $Id: ResourceReWriter.java,v 1.4 2009/03/23 21:12:53 andre Exp $
+ * @version $Id: ResourceReWriter.java,v 1.5 2009/03/23 22:30:22 andre Exp $
  */
-public class ResourceReWriter extends ResourceWriter {
+public final class ResourceReWriter extends ResourceWriter {
     private static final Logger log = 
Logging.getLoggerInstance(ResourceReWriter.class);
     
     private URL url;
@@ -52,14 +52,18 @@
      * @param uc the already elsewhere created URLConnection for efficiency
      */
     private void rewrite() throws IOException {
-        log.debug("REwriting: " + url + " -> file: " + filename);
+        if (log.isDebugEnabled()) log.debug("REwriting: " + url + " -> file: " 
+ filename);
         File f = getFile(filename);
         if (f.exists()) {
             //log.warn("File '" + f.toString() + "' already exists, deleting 
it and saving again.");
             f.delete();
         }
         
-        Map<String,String> links2files = MMGet.url2links.get(url);        
+        Map<String,String> links2files = new HashMap<String,String>();
+        synchronized(MMGet.url2links) {
+            links2files = MMGet.url2links.remove(url);
+        }
+
         BufferedReader in = new BufferedReader(new 
InputStreamReader(url.openStream()));
         PrintWriter out = new PrintWriter(new FileWriter(f));
         String line;
@@ -83,6 +87,7 @@
                         if (!testlink.equals(link)) continue;
                         
                         line = line.replace(hitlink, file);
+                        if (log.isDebugEnabled()) 
                         log.debug("replaced '" + link + "' with '" + file + "' 
in: " + filename);
                     }
                 }
@@ -93,7 +98,7 @@
         in.close();
         out.close();
         
-        log.debug("Saved: " + url + " -> file: " + f.toString() );
+        if (log.isDebugEnabled()) log.debug("Saved: " + url + " -> file: " + 
f.toString() );
     
     }
     


Index: ResourceWriter.java
===================================================================
RCS file: 
/var/cvs/speeltuin/andre/mmget/src/org/mmbase/mmget/ResourceWriter.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -b -r1.4 -r1.5
--- ResourceWriter.java 12 Mar 2009 10:55:20 -0000      1.4
+++ ResourceWriter.java 23 Mar 2009 22:30:22 -0000      1.5
@@ -13,7 +13,7 @@
  * Writes a resource found on an url to disk. 
  *
  * @author Andr&eacute; van Toly
- * @version $Id: ResourceWriter.java,v 1.4 2009/03/12 10:55:20 andre Exp $
+ * @version $Id: ResourceWriter.java,v 1.5 2009/03/23 22:30:22 andre Exp $
  */
 public class ResourceWriter {
     private static final Logger log = 
Logging.getLoggerInstance(ResourceWriter.class);
@@ -57,7 +57,7 @@
     
     protected void disconnect() {
         if (uc != null) { 
-            log.debug("disconnecting... " + url.toString());
+            //log.debug("disconnecting... " + url.toString());
             uc.disconnect(); 
         }
     }
@@ -67,11 +67,23 @@
      */
     protected void write() throws IOException {
         File f = getFile(filename);
+        
         if (f.exists()) {
             //log.warn("File '" + f.toString() + "' already exists, deleting 
it and saving again.");
+            if (f.lastModified() <= uc.getLastModified()) {
             f.delete();
+                
+            } else {
+                log.info("Not modified: " + f.toString() + ", f:" + 
f.lastModified() + " uc:" + uc.getLastModified());
+                // MMGet.savedURLs.put(url, filename);
+                MMGet.addSavedURL(url, filename);
+                
+                return;
         }
         
+        }
+        
+        
         BufferedInputStream  in  = new 
BufferedInputStream(uc.getInputStream());
         BufferedOutputStream out = new BufferedOutputStream(new 
FileOutputStream(f));
         byte[] buf = new byte[1024];
@@ -85,7 +97,8 @@
         out.close();
         
         log.debug("Saved: " + f.toString() );
-        MMGet.savedURLs.put(url, filename);
+        // MMGet.savedURLs.put(url, filename);
+        MMGet.addSavedURL(url, filename);
     }
     
     /**
@@ -96,7 +109,7 @@
      */
     private static URLConnection getURLConnection(URL url) throws 
SocketException, IOException {
         URLConnection uc = url.openConnection();
-        if (url.getProtocol().equals("http")) {
+        if (url.getProtocol().equals("http") || 
url.getProtocol().equals("https")) {
             HttpURLConnection huc = (HttpURLConnection)uc;
             int res = huc.getResponseCode();
             if (res == -1) {
@@ -108,11 +121,13 @@
             } else {
                 return huc;
             }
+        /*   
         } else if (url.getProtocol().equals("file")) {
             InputStream is = uc.getInputStream();
             is.close();
             // If that didn't throw an exception, the file is probably OK
             return uc;
+        */
         } else {
             // return "(non-HTTP)";
             return null;
@@ -164,22 +179,22 @@
         String filename = link.substring(MMGet.serverpart.length());
         if (filename.startsWith("/")) filename = filename.substring(1);
 
-        log.debug("0: file: " + filename);
+        //log.debug("0: file: " + filename);
         if (contenttype == MMGet.CONTENTTYPE_HTML) {
             if (filename.equals("")) {
                 filename = "index.html";
             } else if (!filename.endsWith("/") && 
!MMGet.hasExtension(filename)) {
                 filename = filename + "/index.html";
-                log.debug("1: /bla file: " + filename); // TODO: add extra ../ 
to rewritten links !!?
+                //log.debug("1: /bla file: " + filename); // TODO: add extra 
../ to rewritten links !!?
             }
             
             if (filename.endsWith("/")) {
                 filename = filename + "index.html";
-                log.debug("2: /bla/ file: " + filename);
+                //log.debug("2: /bla/ file: " + filename);
             }
         }
 
-        log.debug("url: " + url.toString() + " -> file: " + filename);
+        //log.debug("url: " + url.toString() + " -> file: " + filename);
         return filename;
     }
  }
_______________________________________________
Cvs mailing list
[email protected]
http://lists.mmbase.org/mailman/listinfo/cvs

Reply via email to