[YaCy-svn] r4507 - in trunk: htroot source/de/anomic/htmlFilter source/de/anomic/http source/de/anomic/plasma source/de/anomic/plasma/parser/rss source/de/anomic/plasma/parser/tar source/de/anomic/plasma/parser/zip

orbiter at BerliOS Mon, 25 Feb 2008 06:09:51 -0800

Author: orbiter
Date: 2008-02-25 15:08:15 +0100 (Mon, 25 Feb 2008)
New Revision: 4507


Modified:
   trunk/htroot/CacheAdmin_p.java
   trunk/htroot/ViewFile.java
   trunk/htroot/ViewImage.java
   trunk/htroot/yacysearchitem.html
   trunk/htroot/yacysearchitem.java
   trunk/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
   trunk/source/de/anomic/http/httpdFileHandler.java
   trunk/source/de/anomic/plasma/parser/rss/rssParser.java
   trunk/source/de/anomic/plasma/parser/tar/tarParser.java
   trunk/source/de/anomic/plasma/parser/zip/zipParser.java
   trunk/source/de/anomic/plasma/plasmaCondenser.java
   trunk/source/de/anomic/plasma/plasmaCrawlBalancer.java
   trunk/source/de/anomic/plasma/plasmaCrawlNURL.java
   trunk/source/de/anomic/plasma/plasmaParser.java
   trunk/source/de/anomic/plasma/plasmaParserDocument.java
   trunk/source/de/anomic/plasma/plasmaSearchImages.java
   trunk/source/de/anomic/plasma/plasmaSnippetCache.java
Log:
- enhanced recognition, parsing, management and double-occurrence-handling of 
image tags
- enhanced text parser (condenser): found and eliminated bad code parts; 
increase of speed
- added handling of image preview using the image cache from HTCACHE
- some other minor changes

Modified: trunk/htroot/CacheAdmin_p.java
===================================================================
--- trunk/htroot/CacheAdmin_p.java      2008-02-24 21:18:04 UTC (rev 4506)
+++ trunk/htroot/CacheAdmin_p.java      2008-02-25 14:08:15 UTC (rev 4507)
@@ -54,6 +54,7 @@
 import java.io.FilenameFilter;
 import java.io.IOException;
 import java.io.Writer;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.TreeSet;
@@ -313,8 +314,8 @@
         prop.put("info_type_use." + extension, (i == 0) ? 0 : 1);
     }
 
-    private static void formatImageAnchor(serverObjects prop, 
TreeSet<htmlFilterImageEntry> anchor) {
-        final Iterator<htmlFilterImageEntry> iter = anchor.iterator();
+    private static void formatImageAnchor(serverObjects prop, HashMap<String, 
htmlFilterImageEntry> anchor) {
+        final Iterator<htmlFilterImageEntry> iter = anchor.values().iterator();
         htmlFilterImageEntry ie;
         prop.put("info_type_use.images_images", anchor.size());
         int i = 0;

Modified: trunk/htroot/ViewFile.java
===================================================================
--- trunk/htroot/ViewFile.java  2008-02-24 21:18:04 UTC (rev 4506)
+++ trunk/htroot/ViewFile.java  2008-02-25 14:08:15 UTC (rev 4507)
@@ -49,9 +49,9 @@
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.net.URLDecoder;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
-import java.util.TreeSet;
 
 import de.anomic.data.htmlTools;
 import de.anomic.htmlFilter.htmlFilterImageEntry;
@@ -339,8 +339,8 @@
                 i += putMediaInfo(prop, wordArray, i, 
document.getAudiolinks(), "audio", (i % 2 == 0));
                 dark = (i % 2 == 0);
                 
-                TreeSet<htmlFilterImageEntry> ts = document.getImages();
-                Iterator<htmlFilterImageEntry> tsi = ts.iterator();
+                HashMap<String, htmlFilterImageEntry> ts = 
document.getImages();
+                Iterator<htmlFilterImageEntry> tsi = ts.values().iterator();
                 htmlFilterImageEntry entry;
                 while (tsi.hasNext()) {
                     entry = tsi.next();

Modified: trunk/htroot/ViewImage.java
===================================================================
--- trunk/htroot/ViewImage.java 2008-02-24 21:18:04 UTC (rev 4506)
+++ trunk/htroot/ViewImage.java 2008-02-25 14:08:15 UTC (rev 4507)
@@ -135,47 +135,47 @@
             // find original size
             int h = image.getHeight(null);
             int w = image.getWidth(null);
-
-            // System.out.println("DEBUG: get access to image " +
-            // url.toNormalform() + " is " + ((auth) ? "authorized" : "NOT
-            // authorized"));
-
+            
             // in case of not-authorized access shrink the image to prevent
-            // copyright problems
-            // so that images are not larger than thumbnails
-            if ((!auth) && ((w > 16) || (h > 16))) {
+            // copyright problems, so that images are not larger than 
thumbnails
+            if (auth) {
+                maxwidth = (maxwidth == 0) ? w : maxwidth;
+                maxheight = (maxheight == 0) ? h : maxheight;
+            } else if ((w > 16) || (h > 16)) {
                 maxwidth = (int) Math.min(64.0, w * 0.6);
                 maxheight = (int) Math.min(64.0, h * 0.6);
+            } else {
+                maxwidth = 16;
+                maxheight = 16;
             }
 
             // calculate width & height from maxwidth & maxheight
-            if ((maxwidth != 0) || (maxheight != 0)) {
+            if ((maxwidth < w) || (maxheight < h)) {
+                // scale image
                 double hs = (w <= maxwidth) ? 1.0 : ((double) maxwidth) / 
((double) w);
                 double vs = (h <= maxheight) ? 1.0 : ((double) maxheight) / 
((double) h);
                 double scale = Math.min(hs, vs);
                 if (!auth) scale = Math.min(scale, 0.6); // this is for 
copyright purpose
                 if (scale < 1.0) {
-                    width = (int) (w * scale);
-                    height = (int) (h * scale);
+                    width = Math.max(1, (int) (w * scale));
+                    height = Math.max(1, (int) (h * scale));
                 } else {
-                    width = w;
-                    height = h;
+                    width = Math.max(1, w);
+                    height = Math.max(1, h);
                 }
+                
+                // compute scaled image
+                scaled = ((w == width) && (h == height)) ? image : 
image.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);
+                MediaTracker mediaTracker = new MediaTracker(new Container());
+                mediaTracker.addImage(scaled, 0);
+                try {mediaTracker.waitForID(0);} catch (InterruptedException 
e) {}
             } else {
+                // do not scale
                 width = w;
                 height = h;
+                scaled = image;
             }
 
-            // check for minimum values
-            width = Math.max(width, 1);
-            height = Math.max(height, 1);
-
-            // scale image
-            scaled = ((w == width) && (h == height)) ? image : 
image.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);
-            MediaTracker mediaTracker = new MediaTracker(new Container());
-            mediaTracker.addImage(scaled, 0);
-            try {mediaTracker.waitForID(0);} catch (InterruptedException e) {}
-
             if ((height == 16) && (width == 16) && (resource != null)) {
                 // this might be a favicon, store image to cache for faster 
re-load later on
                 iconcache.put(urlString, scaled);

Modified: trunk/htroot/yacysearchitem.html
===================================================================
--- trunk/htroot/yacysearchitem.html    2008-02-24 21:18:04 UTC (rev 4506)
+++ trunk/htroot/yacysearchitem.html    2008-02-25 14:08:15 UTC (rev 4507)
@@ -22,7 +22,7 @@
   ::
   #{items}#
   <div class="thumbcontainer">
-    <a href="#[href]#" class="thumblink" onclick="return hs.expand(this)">
+    <a href="#[hrefCache]#" class="thumblink" onclick="return hs.expand(this)">
       <img src="/ViewImage.png?maxwidth=96&maxheight=96&code=#[code]#" 
alt="#[name]#">
     </a>
     <div class="highslide-caption"><a href="#[href]#">#[name]#<br \><a 
href="#[source]#">#[sourcedom]#</a></a></div>

Modified: trunk/htroot/yacysearchitem.java
===================================================================
--- trunk/htroot/yacysearchitem.java    2008-02-24 21:18:04 UTC (rev 4506)
+++ trunk/htroot/yacysearchitem.java    2008-02-25 14:08:15 UTC (rev 4507)
@@ -67,6 +67,7 @@
         boolean rss = post.get("rss", "false").equals("true");
         boolean authenticated = sb.adminAuthenticated(header) >= 2;
         int item = post.getInt("item", -1);
+        boolean auth = ((String) 
header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "")).equals("localhost") || 
sb.verifyAuthentication(header, true);
         
         // default settings for blank item
         prop.put("content", "0");
@@ -233,6 +234,7 @@
             if (ms == null) {
                 prop.put("content_items", "0");
             } else {
+                prop.putHTML("content_items_0_hrefCache", (auth) ? 
"/ViewImage.png?url=" + ms.href.toNormalform(true, false) : 
ms.href.toNormalform(true, false));
                 prop.putHTML("content_items_0_href", 
ms.href.toNormalform(true, false));
                 prop.put("content_items_0_code", 
sb.licensedURLs.aquireLicense(ms.href));
                 prop.putHTML("content_items_0_name", shorten(ms.name, 
namelength));

Modified: trunk/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
===================================================================
--- trunk/source/de/anomic/htmlFilter/htmlFilterContentScraper.java     
2008-02-24 21:18:04 UTC (rev 4506)
+++ trunk/source/de/anomic/htmlFilter/htmlFilterContentScraper.java     
2008-02-25 14:08:15 UTC (rev 4507)
@@ -54,6 +54,7 @@
 import java.text.Collator;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
@@ -102,7 +103,7 @@
 
     // class variables: collectors for links
     private HashMap<yacyURL, String> anchors;
-    private TreeSet<htmlFilterImageEntry> images; // String(absolute 
url)/ImageEntry relation
+    private HashMap<String, htmlFilterImageEntry> images; // urlhash/image 
relation
     private HashMap<String, String> metas;
     private String title;
     //private String headline;
@@ -127,7 +128,7 @@
         super(linkTags0, linkTags1);
         this.root = root;
         this.anchors = new HashMap<yacyURL, String>();
-        this.images = new TreeSet<htmlFilterImageEntry>();
+        this.images = new HashMap<String, htmlFilterImageEntry>();
         this.metas = new HashMap<String, String>();
         this.title = "";
         this.headlines = new ArrayList[4];
@@ -178,7 +179,7 @@
             } catch (NumberFormatException e) {}
             yacyURL url = absolutePath(tagopts.getProperty("src", ""));
             htmlFilterImageEntry ie = new htmlFilterImageEntry(url, 
tagopts.getProperty("alt",""), width, height);
-            images.add(ie);
+            addImage(images, ie);
         }
         if (tagname.equalsIgnoreCase("base")) try {
             root = new yacyURL(tagopts.getProperty("href", ""), null);
@@ -212,7 +213,7 @@
 
                 if (type.equalsIgnoreCase("shortcut icon")) {
                     htmlFilterImageEntry ie = new 
htmlFilterImageEntry(newLink, linktitle, -1,-1);
-                    images.add(ie);    
+                    images.put(ie.url().hash(), ie);    
                     this.favicon = newLink;
                 } else if (!type.equalsIgnoreCase("stylesheet") && 
!type.equalsIgnoreCase("alternate stylesheet")) {
                     anchors.put(newLink, linktitle);
@@ -234,12 +235,24 @@
         // fire event
         fireScrapeTag0(tagname, tagopts);
     }
-
+    
     public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
         // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + 
tagopts.toString() + ", text=" + new String(text));
         if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) {
             String href = tagopts.getProperty("href", "");
-            if (href.length() > 0) anchors.put(absolutePath(href), 
super.stripAll(new serverCharBuffer(text)).trim().toString());
+            if (href.length() > 0) {
+                yacyURL url = absolutePath(href);
+                String f = url.getFile();
+                int p = f.lastIndexOf('.');
+                String type = (p < 0) ? "" : f.substring(p + 1);
+                if (type.equals("png") || type.equals("gif") || 
type.equals("jpg") || type.equals("jpeg")) {
+                    // special handling of such urls: put them to the image 
urls
+                    htmlFilterImageEntry ie = new htmlFilterImageEntry(url, 
super.stripAll(new serverCharBuffer(text)).trim().toString(), -1, -1);
+                    addImage(images, ie);
+                } else {
+                    anchors.put(url, super.stripAll(new 
serverCharBuffer(text)).trim().toString());
+                }
+            }
         }
         String h;
         if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
@@ -348,7 +361,7 @@
         return anchors;
     }
 
-    public TreeSet<htmlFilterImageEntry> getImages() {
+    public HashMap<String, htmlFilterImageEntry> getImages() {
         // this resturns a String(absolute url)/htmlFilterImageEntry - relation
         return images;
     }
@@ -522,5 +535,24 @@
         
         return scraper;
     }
+    
+    public static void addAllImages(HashMap<String, htmlFilterImageEntry> a, 
HashMap<String, htmlFilterImageEntry> b) {
+        Iterator<Map.Entry<String, htmlFilterImageEntry>> i = 
b.entrySet().iterator();
+        Map.Entry<String, htmlFilterImageEntry> ie;
+        while (i.hasNext()) {
+            ie = i.next();
+            addImage(a, ie.getValue());
+        }
+    }
+    
+    public static void addImage(HashMap<String, htmlFilterImageEntry> a, 
htmlFilterImageEntry ie) {
+        if (a.containsKey(ie.url().hash())) {
+            // in case of a collision, take that image that has the better 
image size tags
+            if ((ie.height() > 0) && (ie.width() > 0)) a.put(ie.url().hash(), 
ie);
+        } else {
+            a.put(ie.url().hash(), ie);
+        }
+    }
+    
 }
 

Modified: trunk/source/de/anomic/http/httpdFileHandler.java
===================================================================
--- trunk/source/de/anomic/http/httpdFileHandler.java   2008-02-24 21:18:04 UTC 
(rev 4506)
+++ trunk/source/de/anomic/http/httpdFileHandler.java   2008-02-25 14:08:15 UTC 
(rev 4507)
@@ -452,6 +452,7 @@
                     sb.append("<html>\n<head>\n</head>\n<body>\n<h1>Index of " 
+ path + "</h1>\n  <ul>\n");
                     File dir = new File(htDocsPath, path);
                     String[] list = dir.list();
+                    if (list == null) list = new String[0]; // should not 
occur!
                     File f;
                     String size;
                     long sz;

Modified: trunk/source/de/anomic/plasma/parser/rss/rssParser.java
===================================================================
--- trunk/source/de/anomic/plasma/parser/rss/rssParser.java     2008-02-24 
21:18:04 UTC (rev 4506)
+++ trunk/source/de/anomic/plasma/parser/rss/rssParser.java     2008-02-25 
14:08:15 UTC (rev 4507)
@@ -50,7 +50,6 @@
 import java.util.Hashtable;
 import java.util.LinkedList;
 import java.util.Map;
-import java.util.TreeSet;
 
 import de.anomic.htmlFilter.htmlFilterAbstractScraper;
 import de.anomic.htmlFilter.htmlFilterContentScraper;
@@ -97,7 +96,7 @@
         try {
             LinkedList<String> feedSections = new LinkedList<String>();
             HashMap<yacyURL, String> anchors = new HashMap<yacyURL, String>();
-            TreeSet<htmlFilterImageEntry> images  = new 
TreeSet<htmlFilterImageEntry>();
+            HashMap<String, htmlFilterImageEntry> images  = new 
HashMap<String, htmlFilterImageEntry>();
             serverByteBuffer text = new serverByteBuffer();
             serverCharBuffer authors = new serverCharBuffer();
             
@@ -114,7 +113,8 @@
             String feedDescription = reader.getChannel().getDescription();
             
             if (reader.getImage() != null) {
-                images.add(new htmlFilterImageEntry(new 
yacyURL(reader.getImage(), null), feedTitle, -1, -1));
+                yacyURL imgURL = new yacyURL(reader.getImage(), null);
+                images.put(imgURL.hash(), new htmlFilterImageEntry(imgURL, 
feedTitle, -1, -1));
             }            
             
             // loop through the feed items
@@ -154,9 +154,9 @@
                             anchors.putAll(itemLinks);
                         }
                         
-                        TreeSet<htmlFilterImageEntry> itemImages = 
scraper.getImages();
+                        HashMap<String, htmlFilterImageEntry> itemImages = 
scraper.getImages();
                         if ((itemImages != null) && (itemImages.size() > 0)) {
-                            images.addAll(itemImages);
+                            htmlFilterContentScraper.addAllImages(images, 
itemImages);
                         }
                         
                         byte[] extractedText = scraper.getText();

Modified: trunk/source/de/anomic/plasma/parser/tar/tarParser.java
===================================================================
--- trunk/source/de/anomic/plasma/parser/tar/tarParser.java     2008-02-24 
21:18:04 UTC (rev 4506)
+++ trunk/source/de/anomic/plasma/parser/tar/tarParser.java     2008-02-25 
14:08:15 UTC (rev 4507)
@@ -53,12 +53,12 @@
 import java.util.Hashtable;
 import java.util.LinkedList;
 import java.util.Map;
-import java.util.TreeSet;
 import java.util.zip.GZIPInputStream;
 
 import com.ice.tar.TarEntry;
 import com.ice.tar.TarInputStream;
 
+import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.htmlFilter.htmlFilterImageEntry;
 import de.anomic.plasma.plasmaParser;
 import de.anomic.plasma.plasmaParserDocument;
@@ -132,7 +132,7 @@
             StringBuffer docAbstrct = new StringBuffer();
 
             Map<yacyURL, String> docAnchors = new HashMap<yacyURL, String>();
-            TreeSet<htmlFilterImageEntry> docImages = new 
TreeSet<htmlFilterImageEntry>(); 
+            HashMap<String, htmlFilterImageEntry> docImages = new 
HashMap<String, htmlFilterImageEntry>(); 
                         
             // looping through the contained files
             TarEntry entry;
@@ -193,7 +193,7 @@
                 }               
                 
                 docAnchors.putAll(subDoc.getAnchors());
-                docImages.addAll(subDoc.getImages());
+                htmlFilterContentScraper.addAllImages(docImages, 
subDoc.getImages());
                 
                 // release subdocument
                 subDoc.close();

Modified: trunk/source/de/anomic/plasma/parser/zip/zipParser.java
===================================================================
--- trunk/source/de/anomic/plasma/parser/zip/zipParser.java     2008-02-24 
21:18:04 UTC (rev 4506)
+++ trunk/source/de/anomic/plasma/parser/zip/zipParser.java     2008-02-25 
14:08:15 UTC (rev 4507)
@@ -53,10 +53,10 @@
 import java.util.Hashtable;
 import java.util.LinkedList;
 import java.util.Map;
-import java.util.TreeSet;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;
 
+import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.htmlFilter.htmlFilterImageEntry;
 import de.anomic.plasma.plasmaParser;
 import de.anomic.plasma.plasmaParserDocument;
@@ -115,7 +115,7 @@
             LinkedList<String> docSections = new LinkedList<String>();
             StringBuffer docAbstrct = new StringBuffer();
             Map<yacyURL, String> docAnchors = new HashMap<yacyURL, String>();
-            TreeSet<htmlFilterImageEntry> docImages = new 
TreeSet<htmlFilterImageEntry>(); 
+            HashMap<String, htmlFilterImageEntry> docImages = new 
HashMap<String, htmlFilterImageEntry>(); 
             
             // creating a new parser class to parse the unzipped content
             plasmaParser theParser = new plasmaParser();            
@@ -176,7 +176,7 @@
                 }
                 
                 docAnchors.putAll(subDoc.getAnchors());
-                docImages.addAll(subDoc.getImages());
+                htmlFilterContentScraper.addAllImages(docImages, 
subDoc.getImages());
                 
                 // release subdocument
                 subDoc.close();

Modified: trunk/source/de/anomic/plasma/plasmaCondenser.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaCondenser.java  2008-02-24 21:18:04 UTC 
(rev 4506)
+++ trunk/source/de/anomic/plasma/plasmaCondenser.java  2008-02-25 14:08:15 UTC 
(rev 4507)
@@ -107,6 +107,19 @@
     
     private final static int numlength = 5;
 
+    // initialize array of invisible characters
+    private static boolean[] invisibleChar = new boolean['z' - ' ' + 1];
+    static {
+        // initialize array of invisible characters
+        String invisibleString = "\"$%&/()=`^+*#'-_:;,<>[]\\";
+        for (int i = ' '; i <= 'z'; i++) {
+            invisibleChar[i - ' '] = false;
+        }
+        for (int i = 0; i < invisibleString.length(); i++) {
+            invisibleChar[invisibleString.charAt(i) - ' '] = true;
+        }
+    }
+    
     //private Properties analysis;
     private TreeMap<String, wordStatProp> words; // a string (the words) to 
(wordStatProp) - relation
     private HashMap<StringBuffer, phraseStatProp> sentences;
@@ -198,7 +211,7 @@
             }
 
             // images
-            Iterator<htmlFilterImageEntry> j = document.getImages().iterator();
+            Iterator<htmlFilterImageEntry> j = 
document.getImages().values().iterator();
             htmlFilterImageEntry ientry;
             while (j.hasNext()) {
                 ientry = j.next();
@@ -659,7 +672,7 @@
     public final static boolean invisible(char c) {
         // TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
         if ((c < ' ') || (c > 'z')) return true;
-        return ("$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\".indexOf(c) >= 0);
+        return invisibleChar[c - ' '];
     }
 
     public static Enumeration<StringBuffer> wordTokenizer(String s, String 
charset, int minLength) {
@@ -727,7 +740,7 @@
 
         public unsievedWordsEnum(InputStream is, String charset) throws 
UnsupportedEncodingException {
             e = new sentencesFromInputStreamEnum(is, charset);
-            s = new StringBuffer();
+            s = new StringBuffer(20);
             buffer = nextElement0();
         }
 
@@ -859,9 +872,9 @@
     }
 
     static StringBuffer readSentence(Reader reader, boolean pre) throws 
IOException {
-        StringBuffer s = new StringBuffer();
+        StringBuffer s = new StringBuffer(20);
         int nextChar;
-        char c;
+        char c, lc = (char) 0;
         
         // find sentence end
         for (;;) {
@@ -871,20 +884,14 @@
                 if (s.length() == 0) return null; else break;
             }
             c = (char) nextChar;
+            if (pre && ((c == (char) 10) || (c == (char) 13))) break;
+            if ((c == (char) 8) || (c == (char) 10) || (c == (char) 13)) c = ' 
';
+            if ((lc == ' ') && (c == ' ')) continue; // ignore double spaces
             s.append(c);
-            if (pre) {
-                if ((c == (char) 10) || (c == (char) 13)) break;
-            } else {
-                if (htmlFilterContentScraper.punctuation(c)) break;
-            }
+            if (htmlFilterContentScraper.punctuation(c)) break;
+            lc = c;
         }
-
-        // replace line endings and tabs by blanks
-        for (int i = 0; i < s.length(); i++) {
-            if ((s.charAt(i) == (char) 10) || (s.charAt(i) == (char) 13) || 
(s.charAt(i) == (char) 8)) s.setCharAt(i, ' ');
-        }
-        // remove all double-spaces
-        int p; while ((p = s.indexOf("  ")) >= 0) s.deleteCharAt(p);
+        
         return s;
     }
 

Modified: trunk/source/de/anomic/plasma/plasmaCrawlBalancer.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaCrawlBalancer.java      2008-02-24 
21:18:04 UTC (rev 4506)
+++ trunk/source/de/anomic/plasma/plasmaCrawlBalancer.java      2008-02-25 
14:08:15 UTC (rev 4507)
@@ -130,7 +130,10 @@
     }
     
     public void finalize() {
-        if (urlFileStack != null) close();
+        if (urlFileStack != null) {
+            serverLog.logWarning("plasmaCrawlBalancer", "crawl stack " + 
stackname + " closed by finalizer");
+            close();
+        }
     }
     
     public synchronized void clear() {

Modified: trunk/source/de/anomic/plasma/plasmaCrawlNURL.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaCrawlNURL.java  2008-02-24 21:18:04 UTC 
(rev 4506)
+++ trunk/source/de/anomic/plasma/plasmaCrawlNURL.java  2008-02-25 14:08:15 UTC 
(rev 4507)
@@ -49,6 +49,8 @@
 import java.util.HashSet;
 import java.util.Iterator;
 
+import de.anomic.server.logging.serverLog;
+
 public class plasmaCrawlNURL {
     
     public static final int STACK_TYPE_NULL     =  0; // do not stack
@@ -64,9 +66,9 @@
     private static final long minimumGlobalDelta = 500; // the minimum time 
difference between access of the same global domain
     private static final long maximumDomAge =  60000; // the maximum age of a 
domain until it is used for another crawl attempt
     
-    private final plasmaCrawlBalancer coreStack;      // links found by 
crawling to depth-1
-    private final plasmaCrawlBalancer limitStack;     // links found by 
crawling at target depth
-    private final plasmaCrawlBalancer remoteStack;    // links from remote 
crawl orders
+    private plasmaCrawlBalancer coreStack;      // links found by crawling to 
depth-1
+    private plasmaCrawlBalancer limitStack;     // links found by crawling at 
target depth
+    private plasmaCrawlBalancer remoteStack;    // links from remote crawl 
orders
     //private final plasmaCrawlBalancer overhangStack;  // links found by 
crawling at depth+1
     //private kelondroStack imageStack;     // links pointing to image 
resources
     //private kelondroStack movieStack;     // links pointing to movie 
resources
@@ -81,12 +83,28 @@
     }
 
     public void close() {
-        coreStack.close();
-        limitStack.close();
+        if (coreStack != null) {
+            coreStack.close();
+            coreStack = null;
+        }
+        if (limitStack != null) {
+            limitStack.close();
+            limitStack = null;
+        }
         //overhangStack.close();
-        remoteStack.close();
+        if (remoteStack != null) {
+            remoteStack.close();
+            remoteStack = null;
+        }
     }
     
+    public void finalize() {
+        if ((coreStack != null) || (limitStack != null) || (remoteStack != 
null)) {
+            serverLog.logWarning("plasmaCrawlNURL", "NURL stack closed by 
finalizer");
+            close();
+        }
+    }
+    
     public boolean notEmpty() {
         return coreStack.notEmpty() || limitStack.notEmpty() || 
remoteStack.notEmpty();
     }

Modified: trunk/source/de/anomic/plasma/plasmaParser.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaParser.java     2008-02-24 21:18:04 UTC 
(rev 4506)
+++ trunk/source/de/anomic/plasma/plasmaParser.java     2008-02-25 14:08:15 UTC 
(rev 4507)
@@ -39,6 +39,7 @@
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Hashtable;
@@ -747,7 +748,7 @@
         
     }
     
-    static Map<yacyURL, String> allReflinks(Set<?> links) {
+    static Map<yacyURL, String> allReflinks(Collection<?> links) {
         // links is either a Set of Strings (with urls) or 
htmlFilterImageEntries
         // we find all links that are part of a reference inside a url
         HashMap<yacyURL, String> v = new HashMap<yacyURL, String>();
@@ -786,7 +787,7 @@
         return v;
     }
     
-    static Map<yacyURL, String> allSubpaths(Set<?> links) {
+    static Map<yacyURL, String> allSubpaths(Collection<?> links) {
         // links is either a Set of Strings (urls) or a Set of 
htmlFilterImageEntries
         HashSet<String> h = new HashSet<String>();
         Iterator<?> i = links.iterator();

Modified: trunk/source/de/anomic/plasma/plasmaParserDocument.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaParserDocument.java     2008-02-24 
21:18:04 UTC (rev 4506)
+++ trunk/source/de/anomic/plasma/plasmaParserDocument.java     2008-02-25 
14:08:15 UTC (rev 4507)
@@ -61,6 +61,7 @@
 import java.util.Map;
 import java.util.TreeSet;
 
+import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.htmlFilter.htmlFilterImageEntry;
 import de.anomic.plasma.parser.Parser;
 
@@ -76,7 +77,7 @@
     private StringBuffer description;   // an abstract, if present: short 
content description
     private Object text;            // the clear text, all that is visible
     private Map<yacyURL, String> anchors;    // all links embedded as 
clickeable entities (anchor tags)
-    private TreeSet<htmlFilterImageEntry> images;         // all visible 
pictures in document
+    private HashMap<String, htmlFilterImageEntry> images;         // all 
visible pictures in document
     // the anchors and images - Maps are URL-to-EntityDescription mappings.
     // The EntityDescription appear either as visible text in anchors or as 
alternative
     // text in image tags.
@@ -89,7 +90,7 @@
     protected plasmaParserDocument(yacyURL location, String mimeType, String 
charset,
                     String[] keywords, String title, String author,
                     String[] sections, String abstrct,
-                    Object text, Map<yacyURL, String> anchors, 
TreeSet<htmlFilterImageEntry> images) {
+                    Object text, Map<yacyURL, String> anchors, HashMap<String, 
htmlFilterImageEntry> images) {
         this.source = location;
         this.mimeType = (mimeType == null) ? "application/octet-stream" : 
mimeType;
         this.charset = charset;
@@ -99,7 +100,7 @@
         this.sections = (sections == null) ? new LinkedList<String>() : 
Arrays.asList(sections);
         this.description = (abstrct == null) ? new StringBuffer() : new 
StringBuffer(abstrct);
         this.anchors = (anchors == null) ? new HashMap<yacyURL, String>(0) : 
anchors;
-        this.images =  (images == null) ? new TreeSet<htmlFilterImageEntry>() 
: images;
+        this.images =  (images == null) ? new HashMap<String, 
htmlFilterImageEntry>() : images;
         this.hyperlinks = null;
         this.audiolinks = null;
         this.videolinks = null;
@@ -124,21 +125,21 @@
     public plasmaParserDocument(yacyURL location, String mimeType, String 
charset,
                     String[] keywords, String title, String author,
                     String[] sections, String abstrct,
-                    byte[] text, Map<yacyURL, String> anchors, 
TreeSet<htmlFilterImageEntry> images) {
+                    byte[] text, Map<yacyURL, String> anchors, HashMap<String, 
htmlFilterImageEntry> images) {
         this(location, mimeType, charset, keywords, title, author, sections, 
abstrct, (Object)text, anchors, images);
     }
     
     public plasmaParserDocument(yacyURL location, String mimeType, String 
charset,
             String[] keywords, String title, String author,
             String[] sections, String abstrct,
-            File text, Map<yacyURL, String> anchors, 
TreeSet<htmlFilterImageEntry> images) {
+            File text, Map<yacyURL, String> anchors, HashMap<String, 
htmlFilterImageEntry> images) {
         this(location, mimeType, charset, keywords, title, author, sections, 
abstrct, (Object)text, anchors, images);
     }
     
     public plasmaParserDocument(yacyURL location, String mimeType, String 
charset,
             String[] keywords, String title, String author,
             String[] sections, String abstrct,
-            serverCachedFileOutputStream text, Map<yacyURL, String> anchors, 
TreeSet<htmlFilterImageEntry> images) {
+            serverCachedFileOutputStream text, Map<yacyURL, String> anchors, 
HashMap<String, htmlFilterImageEntry> images) {
         this(location, mimeType, charset, keywords, title, author, sections, 
abstrct, (Object)text, anchors, images);
     }
 
@@ -310,7 +311,7 @@
         return this.videolinks;
     }
     
-    public TreeSet<htmlFilterImageEntry> getImages() {
+    public HashMap<String, htmlFilterImageEntry> getImages() {
         // returns all links enbedded as pictures (visible in document)
         // this resturns a htmlFilterImageEntry collection
         if (!resorted) resortLinks();
@@ -341,7 +342,7 @@
         audiolinks = new HashMap<yacyURL, String>();
         applinks   = new HashMap<yacyURL, String>();
         emaillinks = new HashMap<String, String>();
-        TreeSet<htmlFilterImageEntry> collectedImages = new TreeSet<htmlFilterImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
+        HashMap<String, htmlFilterImageEntry> collectedImages = new HashMap<String, htmlFilterImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
         Map.Entry<yacyURL, String> entry;
         while (i.hasNext()) {
             entry = i.next();
@@ -361,7 +362,7 @@
                     if (plasmaParser.mediaExtContains(ext)) {
                         // this is not a normal anchor, its a media link
                         if (plasmaParser.imageExtContains(ext)) {
-                            collectedImages.add(new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1));
+                            htmlFilterContentScraper.addImage(collectedImages, new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1));
                         }
                         else if (plasmaParser.audioExtContains(ext)) audiolinks.put(url, (String)entry.getValue());
                         else if (plasmaParser.videoExtContains(ext)) videolinks.put(url, (String)entry.getValue());
@@ -374,23 +375,18 @@
         }
         
         // add image links that we collected from the anchors to the image map
-        Iterator<htmlFilterImageEntry>  j = collectedImages.iterator();
-        htmlFilterImageEntry iEntry;
-        while (j.hasNext()) {
-            iEntry = (htmlFilterImageEntry) j.next();
-            if (!images.contains(iEntry)) images.add(iEntry);
-        }
+        htmlFilterContentScraper.addAllImages(images, collectedImages);
        
         // expand the hyperlinks:
         // we add artificial hyperlinks to the hyperlink set
         // that can be calculated from given hyperlinks and imagelinks
         
-        hyperlinks.putAll(plasmaParser.allReflinks(images));
+        hyperlinks.putAll(plasmaParser.allReflinks(images.values()));
         hyperlinks.putAll(plasmaParser.allReflinks(audiolinks.keySet()));
         hyperlinks.putAll(plasmaParser.allReflinks(videolinks.keySet()));
         hyperlinks.putAll(plasmaParser.allReflinks(applinks.keySet()));
         hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks.keySet()));
-        hyperlinks.putAll(plasmaParser.allSubpaths(images));
+        hyperlinks.putAll(plasmaParser.allSubpaths(images.values()));
         hyperlinks.putAll(plasmaParser.allSubpaths(audiolinks.keySet()));
         hyperlinks.putAll(plasmaParser.allSubpaths(videolinks.keySet()));
         hyperlinks.putAll(plasmaParser.allSubpaths(applinks.keySet()));
@@ -417,7 +413,7 @@
         serverFileUtils.copy(doc.getText(), (serverCachedFileOutputStream)this.text);
         
         anchors.putAll(doc.getAnchors());
-        images.addAll(doc.getImages());
+        htmlFilterContentScraper.addAllImages(images, doc.getImages());
     }
     
     /**

Modified: trunk/source/de/anomic/plasma/plasmaSearchImages.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaSearchImages.java       2008-02-24 21:18:04 UTC (rev 4506)
+++ trunk/source/de/anomic/plasma/plasmaSearchImages.java       2008-02-25 14:08:15 UTC (rev 4507)
@@ -43,9 +43,10 @@
 
 import java.io.InputStream;
 import java.net.MalformedURLException;
+import java.util.HashMap;
 import java.util.Iterator;
-import java.util.TreeSet;
 
+import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.htmlFilter.htmlFilterImageEntry;
 import de.anomic.plasma.parser.ParserException;
 import de.anomic.server.serverDate;
@@ -53,11 +54,11 @@
 
 public final class plasmaSearchImages {
 
-    private TreeSet<htmlFilterImageEntry> images;
+    private HashMap<String, htmlFilterImageEntry> images;
     
     public plasmaSearchImages(long maxTime, yacyURL url, int depth) {
         long start = System.currentTimeMillis();
-        this.images = new TreeSet<htmlFilterImageEntry>();
+        this.images = new HashMap<String, htmlFilterImageEntry>();
         if (maxTime > 10) {
             Object[] resource = plasmaSnippetCache.getResource(url, true, (int) maxTime, false);
             InputStream res = (InputStream) resource[0];
@@ -75,7 +76,7 @@
                 if (document == null) return;
                 
                 // add the image links
-                this.addAll(document.getImages());
+                htmlFilterContentScraper.addAllImages(this.images, document.getImages());
 
                 // add also links from pages one step deeper, if depth > 0
                 if (depth > 0) {
@@ -97,26 +98,13 @@
     
     public void addAll(plasmaSearchImages m) {
         synchronized (m.images) {
-            addAll(m.images);
+            htmlFilterContentScraper.addAllImages(this.images, m.images);
         }
     }
     
-    private void addAll(TreeSet<htmlFilterImageEntry> ts) {
-        Iterator<htmlFilterImageEntry> i = ts.iterator();
-        htmlFilterImageEntry ie;
-        while (i.hasNext()) {
-            ie = i.next();
-            if (images.contains(ie)) {
-                if ((ie.height() > 0) && (ie.width() > 0)) images.add(ie);
-            } else {
-                images.add(ie);
-            }
-        }
-    }
-    
     public Iterator<htmlFilterImageEntry> entries() {
         // returns htmlFilterImageEntry - Objects
-        return images.iterator();
+        return images.values().iterator();
     }
     
 }

Modified: trunk/source/de/anomic/plasma/plasmaSnippetCache.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaSnippetCache.java       2008-02-24 21:18:04 UTC (rev 4506)
+++ trunk/source/de/anomic/plasma/plasmaSnippetCache.java       2008-02-25 14:08:15 UTC (rev 4507)
@@ -697,7 +697,8 @@
     
     public static ArrayList<MediaSnippet> computeImageSnippets(plasmaParserDocument document, Set<String> queryhashes) {
         
-        TreeSet<htmlFilterImageEntry> images = document.getImages(); // iterates images in descending size order!
+        TreeSet<htmlFilterImageEntry> images = new TreeSet<htmlFilterImageEntry>();
+        images.addAll(document.getImages().values()); // iterates images in descending size order!
         // a measurement for the size of the images can be retrieved using the htmlFilterImageEntry.hashCode()
         
         Iterator<htmlFilterImageEntry> i = images.iterator();

_______________________________________________
YaCy-svn mailing list
[email protected]
https://lists.berlios.de/mailman/listinfo/yacy-svn

[YaCy-svn] r4507 - in trunk: htroot source/de/anomic/htmlFilter source/de/anomic/http source/de/anomic/plasma source/de/anomic/plasma/parser/rss source/de/anomic/plasma/parser/tar source/de/anomic/plasma/parser/zip

Antwort per Email an