Author: orbiter
Date: 2008-02-25 15:08:15 +0100 (Mon, 25 Feb 2008)
New Revision: 4507
Modified:
trunk/htroot/CacheAdmin_p.java
trunk/htroot/ViewFile.java
trunk/htroot/ViewImage.java
trunk/htroot/yacysearchitem.html
trunk/htroot/yacysearchitem.java
trunk/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
trunk/source/de/anomic/http/httpdFileHandler.java
trunk/source/de/anomic/plasma/parser/rss/rssParser.java
trunk/source/de/anomic/plasma/parser/tar/tarParser.java
trunk/source/de/anomic/plasma/parser/zip/zipParser.java
trunk/source/de/anomic/plasma/plasmaCondenser.java
trunk/source/de/anomic/plasma/plasmaCrawlBalancer.java
trunk/source/de/anomic/plasma/plasmaCrawlNURL.java
trunk/source/de/anomic/plasma/plasmaParser.java
trunk/source/de/anomic/plasma/plasmaParserDocument.java
trunk/source/de/anomic/plasma/plasmaSearchImages.java
trunk/source/de/anomic/plasma/plasmaSnippetCache.java
Log:
- enhanced recognition, parsing, management and double-occurrence-handling of
image tags
- enhanced text parser (condenser): found and eliminated inefficient code;
increased speed
- added handling of image preview using the image cache from HTCACHE
- some other minor changes
Modified: trunk/htroot/CacheAdmin_p.java
===================================================================
--- trunk/htroot/CacheAdmin_p.java 2008-02-24 21:18:04 UTC (rev 4506)
+++ trunk/htroot/CacheAdmin_p.java 2008-02-25 14:08:15 UTC (rev 4507)
@@ -54,6 +54,7 @@
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.Writer;
+import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
@@ -313,8 +314,8 @@
prop.put("info_type_use." + extension, (i == 0) ? 0 : 1);
}
- private static void formatImageAnchor(serverObjects prop,
TreeSet<htmlFilterImageEntry> anchor) {
- final Iterator<htmlFilterImageEntry> iter = anchor.iterator();
+ private static void formatImageAnchor(serverObjects prop, HashMap<String,
htmlFilterImageEntry> anchor) {
+ final Iterator<htmlFilterImageEntry> iter = anchor.values().iterator();
htmlFilterImageEntry ie;
prop.put("info_type_use.images_images", anchor.size());
int i = 0;
Modified: trunk/htroot/ViewFile.java
===================================================================
--- trunk/htroot/ViewFile.java 2008-02-24 21:18:04 UTC (rev 4506)
+++ trunk/htroot/ViewFile.java 2008-02-25 14:08:15 UTC (rev 4507)
@@ -49,9 +49,9 @@
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLDecoder;
+import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
-import java.util.TreeSet;
import de.anomic.data.htmlTools;
import de.anomic.htmlFilter.htmlFilterImageEntry;
@@ -339,8 +339,8 @@
i += putMediaInfo(prop, wordArray, i,
document.getAudiolinks(), "audio", (i % 2 == 0));
dark = (i % 2 == 0);
- TreeSet<htmlFilterImageEntry> ts = document.getImages();
- Iterator<htmlFilterImageEntry> tsi = ts.iterator();
+ HashMap<String, htmlFilterImageEntry> ts =
document.getImages();
+ Iterator<htmlFilterImageEntry> tsi = ts.values().iterator();
htmlFilterImageEntry entry;
while (tsi.hasNext()) {
entry = tsi.next();
Modified: trunk/htroot/ViewImage.java
===================================================================
--- trunk/htroot/ViewImage.java 2008-02-24 21:18:04 UTC (rev 4506)
+++ trunk/htroot/ViewImage.java 2008-02-25 14:08:15 UTC (rev 4507)
@@ -135,47 +135,47 @@
// find original size
int h = image.getHeight(null);
int w = image.getWidth(null);
-
- // System.out.println("DEBUG: get access to image " +
- // url.toNormalform() + " is " + ((auth) ? "authorized" : "NOT
- // authorized"));
-
+
// in case of not-authorized access shrink the image to prevent
- // copyright problems
- // so that images are not larger than thumbnails
- if ((!auth) && ((w > 16) || (h > 16))) {
+ // copyright problems, so that images are not larger than
thumbnails
+ if (auth) {
+ maxwidth = (maxwidth == 0) ? w : maxwidth;
+ maxheight = (maxheight == 0) ? h : maxheight;
+ } else if ((w > 16) || (h > 16)) {
maxwidth = (int) Math.min(64.0, w * 0.6);
maxheight = (int) Math.min(64.0, h * 0.6);
+ } else {
+ maxwidth = 16;
+ maxheight = 16;
}
// calculate width & height from maxwidth & maxheight
- if ((maxwidth != 0) || (maxheight != 0)) {
+ if ((maxwidth < w) || (maxheight < h)) {
+ // scale image
double hs = (w <= maxwidth) ? 1.0 : ((double) maxwidth) /
((double) w);
double vs = (h <= maxheight) ? 1.0 : ((double) maxheight) /
((double) h);
double scale = Math.min(hs, vs);
if (!auth) scale = Math.min(scale, 0.6); // this is for
copyright purpose
if (scale < 1.0) {
- width = (int) (w * scale);
- height = (int) (h * scale);
+ width = Math.max(1, (int) (w * scale));
+ height = Math.max(1, (int) (h * scale));
} else {
- width = w;
- height = h;
+ width = Math.max(1, w);
+ height = Math.max(1, h);
}
+
+ // compute scaled image
+ scaled = ((w == width) && (h == height)) ? image :
image.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);
+ MediaTracker mediaTracker = new MediaTracker(new Container());
+ mediaTracker.addImage(scaled, 0);
+ try {mediaTracker.waitForID(0);} catch (InterruptedException
e) {}
} else {
+ // do not scale
width = w;
height = h;
+ scaled = image;
}
- // check for minimum values
- width = Math.max(width, 1);
- height = Math.max(height, 1);
-
- // scale image
- scaled = ((w == width) && (h == height)) ? image :
image.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);
- MediaTracker mediaTracker = new MediaTracker(new Container());
- mediaTracker.addImage(scaled, 0);
- try {mediaTracker.waitForID(0);} catch (InterruptedException e) {}
-
if ((height == 16) && (width == 16) && (resource != null)) {
// this might be a favicon, store image to cache for faster
re-load later on
iconcache.put(urlString, scaled);
Modified: trunk/htroot/yacysearchitem.html
===================================================================
--- trunk/htroot/yacysearchitem.html 2008-02-24 21:18:04 UTC (rev 4506)
+++ trunk/htroot/yacysearchitem.html 2008-02-25 14:08:15 UTC (rev 4507)
@@ -22,7 +22,7 @@
::
#{items}#
<div class="thumbcontainer">
- <a href="#[href]#" class="thumblink" onclick="return hs.expand(this)">
+ <a href="#[hrefCache]#" class="thumblink" onclick="return hs.expand(this)">
<img src="/ViewImage.png?maxwidth=96&maxheight=96&code=#[code]#"
alt="#[name]#">
</a>
<div class="highslide-caption"><a href="#[href]#">#[name]#<br \><a
href="#[source]#">#[sourcedom]#</a></a></div>
Modified: trunk/htroot/yacysearchitem.java
===================================================================
--- trunk/htroot/yacysearchitem.java 2008-02-24 21:18:04 UTC (rev 4506)
+++ trunk/htroot/yacysearchitem.java 2008-02-25 14:08:15 UTC (rev 4507)
@@ -67,6 +67,7 @@
boolean rss = post.get("rss", "false").equals("true");
boolean authenticated = sb.adminAuthenticated(header) >= 2;
int item = post.getInt("item", -1);
+ boolean auth = ((String)
header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "")).equals("localhost") ||
sb.verifyAuthentication(header, true);
// default settings for blank item
prop.put("content", "0");
@@ -233,6 +234,7 @@
if (ms == null) {
prop.put("content_items", "0");
} else {
+ prop.putHTML("content_items_0_hrefCache", (auth) ?
"/ViewImage.png?url=" + ms.href.toNormalform(true, false) :
ms.href.toNormalform(true, false));
prop.putHTML("content_items_0_href",
ms.href.toNormalform(true, false));
prop.put("content_items_0_code",
sb.licensedURLs.aquireLicense(ms.href));
prop.putHTML("content_items_0_name", shorten(ms.name,
namelength));
Modified: trunk/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
===================================================================
--- trunk/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
2008-02-24 21:18:04 UTC (rev 4506)
+++ trunk/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
2008-02-25 14:08:15 UTC (rev 4507)
@@ -54,6 +54,7 @@
import java.text.Collator;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
@@ -102,7 +103,7 @@
// class variables: collectors for links
private HashMap<yacyURL, String> anchors;
- private TreeSet<htmlFilterImageEntry> images; // String(absolute
url)/ImageEntry relation
+ private HashMap<String, htmlFilterImageEntry> images; // urlhash/image
relation
private HashMap<String, String> metas;
private String title;
//private String headline;
@@ -127,7 +128,7 @@
super(linkTags0, linkTags1);
this.root = root;
this.anchors = new HashMap<yacyURL, String>();
- this.images = new TreeSet<htmlFilterImageEntry>();
+ this.images = new HashMap<String, htmlFilterImageEntry>();
this.metas = new HashMap<String, String>();
this.title = "";
this.headlines = new ArrayList[4];
@@ -178,7 +179,7 @@
} catch (NumberFormatException e) {}
yacyURL url = absolutePath(tagopts.getProperty("src", ""));
htmlFilterImageEntry ie = new htmlFilterImageEntry(url,
tagopts.getProperty("alt",""), width, height);
- images.add(ie);
+ addImage(images, ie);
}
if (tagname.equalsIgnoreCase("base")) try {
root = new yacyURL(tagopts.getProperty("href", ""), null);
@@ -212,7 +213,7 @@
if (type.equalsIgnoreCase("shortcut icon")) {
htmlFilterImageEntry ie = new
htmlFilterImageEntry(newLink, linktitle, -1,-1);
- images.add(ie);
+ images.put(ie.url().hash(), ie);
this.favicon = newLink;
} else if (!type.equalsIgnoreCase("stylesheet") &&
!type.equalsIgnoreCase("alternate stylesheet")) {
anchors.put(newLink, linktitle);
@@ -234,12 +235,24 @@
// fire event
fireScrapeTag0(tagname, tagopts);
}
-
+
public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" +
tagopts.toString() + ", text=" + new String(text));
if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) {
String href = tagopts.getProperty("href", "");
- if (href.length() > 0) anchors.put(absolutePath(href),
super.stripAll(new serverCharBuffer(text)).trim().toString());
+ if (href.length() > 0) {
+ yacyURL url = absolutePath(href);
+ String f = url.getFile();
+ int p = f.lastIndexOf('.');
+ String type = (p < 0) ? "" : f.substring(p + 1);
+ if (type.equals("png") || type.equals("gif") ||
type.equals("jpg") || type.equals("jpeg")) {
+ // special handling of such urls: put them to the image
urls
+ htmlFilterImageEntry ie = new htmlFilterImageEntry(url,
super.stripAll(new serverCharBuffer(text)).trim().toString(), -1, -1);
+ addImage(images, ie);
+ } else {
+ anchors.put(url, super.stripAll(new
serverCharBuffer(text)).trim().toString());
+ }
+ }
}
String h;
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
@@ -348,7 +361,7 @@
return anchors;
}
- public TreeSet<htmlFilterImageEntry> getImages() {
+ public HashMap<String, htmlFilterImageEntry> getImages() {
// this resturns a String(absolute url)/htmlFilterImageEntry - relation
return images;
}
@@ -522,5 +535,24 @@
return scraper;
}
+
+ public static void addAllImages(HashMap<String, htmlFilterImageEntry> a,
HashMap<String, htmlFilterImageEntry> b) {
+ Iterator<Map.Entry<String, htmlFilterImageEntry>> i =
b.entrySet().iterator();
+ Map.Entry<String, htmlFilterImageEntry> ie;
+ while (i.hasNext()) {
+ ie = i.next();
+ addImage(a, ie.getValue());
+ }
+ }
+
+ public static void addImage(HashMap<String, htmlFilterImageEntry> a,
htmlFilterImageEntry ie) {
+ if (a.containsKey(ie.url().hash())) {
+ // in case of a collision, take that image that has the better
image size tags
+ if ((ie.height() > 0) && (ie.width() > 0)) a.put(ie.url().hash(),
ie);
+ } else {
+ a.put(ie.url().hash(), ie);
+ }
+ }
+
}
Modified: trunk/source/de/anomic/http/httpdFileHandler.java
===================================================================
--- trunk/source/de/anomic/http/httpdFileHandler.java 2008-02-24 21:18:04 UTC
(rev 4506)
+++ trunk/source/de/anomic/http/httpdFileHandler.java 2008-02-25 14:08:15 UTC
(rev 4507)
@@ -452,6 +452,7 @@
sb.append("<html>\n<head>\n</head>\n<body>\n<h1>Index of "
+ path + "</h1>\n <ul>\n");
File dir = new File(htDocsPath, path);
String[] list = dir.list();
+ if (list == null) list = new String[0]; // should not
occur!
File f;
String size;
long sz;
Modified: trunk/source/de/anomic/plasma/parser/rss/rssParser.java
===================================================================
--- trunk/source/de/anomic/plasma/parser/rss/rssParser.java 2008-02-24
21:18:04 UTC (rev 4506)
+++ trunk/source/de/anomic/plasma/parser/rss/rssParser.java 2008-02-25
14:08:15 UTC (rev 4507)
@@ -50,7 +50,6 @@
import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Map;
-import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterAbstractScraper;
import de.anomic.htmlFilter.htmlFilterContentScraper;
@@ -97,7 +96,7 @@
try {
LinkedList<String> feedSections = new LinkedList<String>();
HashMap<yacyURL, String> anchors = new HashMap<yacyURL, String>();
- TreeSet<htmlFilterImageEntry> images = new
TreeSet<htmlFilterImageEntry>();
+ HashMap<String, htmlFilterImageEntry> images = new
HashMap<String, htmlFilterImageEntry>();
serverByteBuffer text = new serverByteBuffer();
serverCharBuffer authors = new serverCharBuffer();
@@ -114,7 +113,8 @@
String feedDescription = reader.getChannel().getDescription();
if (reader.getImage() != null) {
- images.add(new htmlFilterImageEntry(new
yacyURL(reader.getImage(), null), feedTitle, -1, -1));
+ yacyURL imgURL = new yacyURL(reader.getImage(), null);
+ images.put(imgURL.hash(), new htmlFilterImageEntry(imgURL,
feedTitle, -1, -1));
}
// loop through the feed items
@@ -154,9 +154,9 @@
anchors.putAll(itemLinks);
}
- TreeSet<htmlFilterImageEntry> itemImages =
scraper.getImages();
+ HashMap<String, htmlFilterImageEntry> itemImages =
scraper.getImages();
if ((itemImages != null) && (itemImages.size() > 0)) {
- images.addAll(itemImages);
+ htmlFilterContentScraper.addAllImages(images,
itemImages);
}
byte[] extractedText = scraper.getText();
Modified: trunk/source/de/anomic/plasma/parser/tar/tarParser.java
===================================================================
--- trunk/source/de/anomic/plasma/parser/tar/tarParser.java 2008-02-24
21:18:04 UTC (rev 4506)
+++ trunk/source/de/anomic/plasma/parser/tar/tarParser.java 2008-02-25
14:08:15 UTC (rev 4507)
@@ -53,12 +53,12 @@
import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Map;
-import java.util.TreeSet;
import java.util.zip.GZIPInputStream;
import com.ice.tar.TarEntry;
import com.ice.tar.TarInputStream;
+import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaParserDocument;
@@ -132,7 +132,7 @@
StringBuffer docAbstrct = new StringBuffer();
Map<yacyURL, String> docAnchors = new HashMap<yacyURL, String>();
- TreeSet<htmlFilterImageEntry> docImages = new
TreeSet<htmlFilterImageEntry>();
+ HashMap<String, htmlFilterImageEntry> docImages = new
HashMap<String, htmlFilterImageEntry>();
// looping through the contained files
TarEntry entry;
@@ -193,7 +193,7 @@
}
docAnchors.putAll(subDoc.getAnchors());
- docImages.addAll(subDoc.getImages());
+ htmlFilterContentScraper.addAllImages(docImages,
subDoc.getImages());
// release subdocument
subDoc.close();
Modified: trunk/source/de/anomic/plasma/parser/zip/zipParser.java
===================================================================
--- trunk/source/de/anomic/plasma/parser/zip/zipParser.java 2008-02-24
21:18:04 UTC (rev 4506)
+++ trunk/source/de/anomic/plasma/parser/zip/zipParser.java 2008-02-25
14:08:15 UTC (rev 4507)
@@ -53,10 +53,10 @@
import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Map;
-import java.util.TreeSet;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
+import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaParserDocument;
@@ -115,7 +115,7 @@
LinkedList<String> docSections = new LinkedList<String>();
StringBuffer docAbstrct = new StringBuffer();
Map<yacyURL, String> docAnchors = new HashMap<yacyURL, String>();
- TreeSet<htmlFilterImageEntry> docImages = new
TreeSet<htmlFilterImageEntry>();
+ HashMap<String, htmlFilterImageEntry> docImages = new
HashMap<String, htmlFilterImageEntry>();
// creating a new parser class to parse the unzipped content
plasmaParser theParser = new plasmaParser();
@@ -176,7 +176,7 @@
}
docAnchors.putAll(subDoc.getAnchors());
- docImages.addAll(subDoc.getImages());
+ htmlFilterContentScraper.addAllImages(docImages,
subDoc.getImages());
// release subdocument
subDoc.close();
Modified: trunk/source/de/anomic/plasma/plasmaCondenser.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaCondenser.java 2008-02-24 21:18:04 UTC
(rev 4506)
+++ trunk/source/de/anomic/plasma/plasmaCondenser.java 2008-02-25 14:08:15 UTC
(rev 4507)
@@ -107,6 +107,19 @@
private final static int numlength = 5;
+ // initialize array of invisible characters
+ private static boolean[] invisibleChar = new boolean['z' - ' ' + 1];
+ static {
+ // initialize array of invisible charachters
+ String invisibleString = "\"$%&/()=`^+*#'-_:;,<>[]\\";
+ for (int i = ' '; i <= 'z'; i++) {
+ invisibleChar[i - ' '] = false;
+ }
+ for (int i = 0; i < invisibleString.length(); i++) {
+ invisibleChar[invisibleString.charAt(i) - ' '] = true;
+ }
+ }
+
//private Properties analysis;
private TreeMap<String, wordStatProp> words; // a string (the words) to
(wordStatProp) - relation
private HashMap<StringBuffer, phraseStatProp> sentences;
@@ -198,7 +211,7 @@
}
// images
- Iterator<htmlFilterImageEntry> j = document.getImages().iterator();
+ Iterator<htmlFilterImageEntry> j =
document.getImages().values().iterator();
htmlFilterImageEntry ientry;
while (j.hasNext()) {
ientry = j.next();
@@ -659,7 +672,7 @@
public final static boolean invisible(char c) {
// TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
if ((c < ' ') || (c > 'z')) return true;
- return ("$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\".indexOf(c) >= 0);
+ return invisibleChar[c - ' '];
}
public static Enumeration<StringBuffer> wordTokenizer(String s, String
charset, int minLength) {
@@ -727,7 +740,7 @@
public unsievedWordsEnum(InputStream is, String charset) throws
UnsupportedEncodingException {
e = new sentencesFromInputStreamEnum(is, charset);
- s = new StringBuffer();
+ s = new StringBuffer(20);
buffer = nextElement0();
}
@@ -859,9 +872,9 @@
}
static StringBuffer readSentence(Reader reader, boolean pre) throws
IOException {
- StringBuffer s = new StringBuffer();
+ StringBuffer s = new StringBuffer(20);
int nextChar;
- char c;
+ char c, lc = (char) 0;
// find sentence end
for (;;) {
@@ -871,20 +884,14 @@
if (s.length() == 0) return null; else break;
}
c = (char) nextChar;
+ if (pre && ((c == (char) 10) || (c == (char) 13))) break;
+ if ((c == (char) 8) || (c == (char) 10) || (c == (char) 13)) c = '
';
+ if ((lc == ' ') && (c == ' ')) continue; // ignore double spaces
s.append(c);
- if (pre) {
- if ((c == (char) 10) || (c == (char) 13)) break;
- } else {
- if (htmlFilterContentScraper.punctuation(c)) break;
- }
+ if (htmlFilterContentScraper.punctuation(c)) break;
+ lc = c;
}
-
- // replace line endings and tabs by blanks
- for (int i = 0; i < s.length(); i++) {
- if ((s.charAt(i) == (char) 10) || (s.charAt(i) == (char) 13) ||
(s.charAt(i) == (char) 8)) s.setCharAt(i, ' ');
- }
- // remove all double-spaces
- int p; while ((p = s.indexOf(" ")) >= 0) s.deleteCharAt(p);
+
return s;
}
Modified: trunk/source/de/anomic/plasma/plasmaCrawlBalancer.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaCrawlBalancer.java 2008-02-24
21:18:04 UTC (rev 4506)
+++ trunk/source/de/anomic/plasma/plasmaCrawlBalancer.java 2008-02-25
14:08:15 UTC (rev 4507)
@@ -130,7 +130,10 @@
}
public void finalize() {
- if (urlFileStack != null) close();
+ if (urlFileStack != null) {
+ serverLog.logWarning("plasmaCrawlBalancer", "crawl stack " +
stackname + " closed by finalizer");
+ close();
+ }
}
public synchronized void clear() {
Modified: trunk/source/de/anomic/plasma/plasmaCrawlNURL.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaCrawlNURL.java 2008-02-24 21:18:04 UTC
(rev 4506)
+++ trunk/source/de/anomic/plasma/plasmaCrawlNURL.java 2008-02-25 14:08:15 UTC
(rev 4507)
@@ -49,6 +49,8 @@
import java.util.HashSet;
import java.util.Iterator;
+import de.anomic.server.logging.serverLog;
+
public class plasmaCrawlNURL {
public static final int STACK_TYPE_NULL = 0; // do not stack
@@ -64,9 +66,9 @@
private static final long minimumGlobalDelta = 500; // the minimum time
difference between access of the same global domain
private static final long maximumDomAge = 60000; // the maximum age of a
domain until it is used for another crawl attempt
- private final plasmaCrawlBalancer coreStack; // links found by
crawling to depth-1
- private final plasmaCrawlBalancer limitStack; // links found by
crawling at target depth
- private final plasmaCrawlBalancer remoteStack; // links from remote
crawl orders
+ private plasmaCrawlBalancer coreStack; // links found by crawling to
depth-1
+ private plasmaCrawlBalancer limitStack; // links found by crawling at
target depth
+ private plasmaCrawlBalancer remoteStack; // links from remote crawl
orders
//private final plasmaCrawlBalancer overhangStack; // links found by
crawling at depth+1
//private kelondroStack imageStack; // links pointing to image
resources
//private kelondroStack movieStack; // links pointing to movie
resources
@@ -81,12 +83,28 @@
}
public void close() {
- coreStack.close();
- limitStack.close();
+ if (coreStack != null) {
+ coreStack.close();
+ coreStack = null;
+ }
+ if (limitStack != null) {
+ limitStack.close();
+ limitStack = null;
+ }
//overhangStack.close();
- remoteStack.close();
+ if (remoteStack != null) {
+ remoteStack.close();
+ remoteStack = null;
+ }
}
+ public void finalize() {
+ if ((coreStack != null) || (limitStack != null) || (remoteStack !=
null)) {
+ serverLog.logWarning("plasmaCrawlNURL", "NURL stack closed by
finalizer");
+ close();
+ }
+ }
+
public boolean notEmpty() {
return coreStack.notEmpty() || limitStack.notEmpty() ||
remoteStack.notEmpty();
}
Modified: trunk/source/de/anomic/plasma/plasmaParser.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaParser.java 2008-02-24 21:18:04 UTC
(rev 4506)
+++ trunk/source/de/anomic/plasma/plasmaParser.java 2008-02-25 14:08:15 UTC
(rev 4507)
@@ -39,6 +39,7 @@
import java.net.MalformedURLException;
import java.net.URI;
import java.util.Arrays;
+import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
@@ -747,7 +748,7 @@
}
- static Map<yacyURL, String> allReflinks(Set<?> links) {
+ static Map<yacyURL, String> allReflinks(Collection<?> links) {
// links is either a Set of Strings (with urls) or
htmlFilterImageEntries
// we find all links that are part of a reference inside a url
HashMap<yacyURL, String> v = new HashMap<yacyURL, String>();
@@ -786,7 +787,7 @@
return v;
}
- static Map<yacyURL, String> allSubpaths(Set<?> links) {
+ static Map<yacyURL, String> allSubpaths(Collection<?> links) {
// links is either a Set of Strings (urls) or a Set of
htmlFilterImageEntries
HashSet<String> h = new HashSet<String>();
Iterator<?> i = links.iterator();
Modified: trunk/source/de/anomic/plasma/plasmaParserDocument.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaParserDocument.java 2008-02-24
21:18:04 UTC (rev 4506)
+++ trunk/source/de/anomic/plasma/plasmaParserDocument.java 2008-02-25
14:08:15 UTC (rev 4507)
@@ -61,6 +61,7 @@
import java.util.Map;
import java.util.TreeSet;
+import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.plasma.parser.Parser;
@@ -76,7 +77,7 @@
private StringBuffer description; // an abstract, if present: short
content description
private Object text; // the clear text, all that is visible
private Map<yacyURL, String> anchors; // all links embedded as
clickeable entities (anchor tags)
- private TreeSet<htmlFilterImageEntry> images; // all visible
pictures in document
+ private HashMap<String, htmlFilterImageEntry> images; // all
visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appear either as visible text in anchors or as
alternative
// text in image tags.
@@ -89,7 +90,7 @@
protected plasmaParserDocument(yacyURL location, String mimeType, String
charset,
String[] keywords, String title, String author,
String[] sections, String abstrct,
- Object text, Map<yacyURL, String> anchors,
TreeSet<htmlFilterImageEntry> images) {
+ Object text, Map<yacyURL, String> anchors, HashMap<String,
htmlFilterImageEntry> images) {
this.source = location;
this.mimeType = (mimeType == null) ? "application/octet-stream" :
mimeType;
this.charset = charset;
@@ -99,7 +100,7 @@
this.sections = (sections == null) ? new LinkedList<String>() :
Arrays.asList(sections);
this.description = (abstrct == null) ? new StringBuffer() : new
StringBuffer(abstrct);
this.anchors = (anchors == null) ? new HashMap<yacyURL, String>(0) :
anchors;
- this.images = (images == null) ? new TreeSet<htmlFilterImageEntry>()
: images;
+ this.images = (images == null) ? new HashMap<String,
htmlFilterImageEntry>() : images;
this.hyperlinks = null;
this.audiolinks = null;
this.videolinks = null;
@@ -124,21 +125,21 @@
public plasmaParserDocument(yacyURL location, String mimeType, String
charset,
String[] keywords, String title, String author,
String[] sections, String abstrct,
- byte[] text, Map<yacyURL, String> anchors,
TreeSet<htmlFilterImageEntry> images) {
+ byte[] text, Map<yacyURL, String> anchors, HashMap<String,
htmlFilterImageEntry> images) {
this(location, mimeType, charset, keywords, title, author, sections,
abstrct, (Object)text, anchors, images);
}
public plasmaParserDocument(yacyURL location, String mimeType, String
charset,
String[] keywords, String title, String author,
String[] sections, String abstrct,
- File text, Map<yacyURL, String> anchors,
TreeSet<htmlFilterImageEntry> images) {
+ File text, Map<yacyURL, String> anchors, HashMap<String,
htmlFilterImageEntry> images) {
this(location, mimeType, charset, keywords, title, author, sections,
abstrct, (Object)text, anchors, images);
}
public plasmaParserDocument(yacyURL location, String mimeType, String
charset,
String[] keywords, String title, String author,
String[] sections, String abstrct,
- serverCachedFileOutputStream text, Map<yacyURL, String> anchors,
TreeSet<htmlFilterImageEntry> images) {
+ serverCachedFileOutputStream text, Map<yacyURL, String> anchors,
HashMap<String, htmlFilterImageEntry> images) {
this(location, mimeType, charset, keywords, title, author, sections,
abstrct, (Object)text, anchors, images);
}
@@ -310,7 +311,7 @@
return this.videolinks;
}
- public TreeSet<htmlFilterImageEntry> getImages() {
+ public HashMap<String, htmlFilterImageEntry> getImages() {
// returns all links enbedded as pictures (visible in document)
// this resturns a htmlFilterImageEntry collection
if (!resorted) resortLinks();
@@ -341,7 +342,7 @@
audiolinks = new HashMap<yacyURL, String>();
applinks = new HashMap<yacyURL, String>();
emaillinks = new HashMap<String, String>();
- TreeSet<htmlFilterImageEntry> collectedImages = new
TreeSet<htmlFilterImageEntry>(); // this is a set that is collected now and
joined later to the imagelinks
+ HashMap<String, htmlFilterImageEntry> collectedImages = new
HashMap<String, htmlFilterImageEntry>(); // this is a set that is collected now
and joined later to the imagelinks
Map.Entry<yacyURL, String> entry;
while (i.hasNext()) {
entry = i.next();
@@ -361,7 +362,7 @@
if (plasmaParser.mediaExtContains(ext)) {
// this is not a normal anchor, its a media link
if (plasmaParser.imageExtContains(ext)) {
- collectedImages.add(new htmlFilterImageEntry(url,
(String) entry.getValue(), -1, -1));
+ htmlFilterContentScraper.addImage(collectedImages,
new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1));
}
else if (plasmaParser.audioExtContains(ext))
audiolinks.put(url, (String)entry.getValue());
else if (plasmaParser.videoExtContains(ext))
videolinks.put(url, (String)entry.getValue());
@@ -374,23 +375,18 @@
}
// add image links that we collected from the anchors to the image map
- Iterator<htmlFilterImageEntry> j = collectedImages.iterator();
- htmlFilterImageEntry iEntry;
- while (j.hasNext()) {
- iEntry = (htmlFilterImageEntry) j.next();
- if (!images.contains(iEntry)) images.add(iEntry);
- }
+ htmlFilterContentScraper.addAllImages(images, collectedImages);
// expand the hyperlinks:
// we add artificial hyperlinks to the hyperlink set
// that can be calculated from given hyperlinks and imagelinks
- hyperlinks.putAll(plasmaParser.allReflinks(images));
+ hyperlinks.putAll(plasmaParser.allReflinks(images.values()));
hyperlinks.putAll(plasmaParser.allReflinks(audiolinks.keySet()));
hyperlinks.putAll(plasmaParser.allReflinks(videolinks.keySet()));
hyperlinks.putAll(plasmaParser.allReflinks(applinks.keySet()));
hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks.keySet()));
- hyperlinks.putAll(plasmaParser.allSubpaths(images));
+ hyperlinks.putAll(plasmaParser.allSubpaths(images.values()));
hyperlinks.putAll(plasmaParser.allSubpaths(audiolinks.keySet()));
hyperlinks.putAll(plasmaParser.allSubpaths(videolinks.keySet()));
hyperlinks.putAll(plasmaParser.allSubpaths(applinks.keySet()));
@@ -417,7 +413,7 @@
serverFileUtils.copy(doc.getText(),
(serverCachedFileOutputStream)this.text);
anchors.putAll(doc.getAnchors());
- images.addAll(doc.getImages());
+ htmlFilterContentScraper.addAllImages(images, doc.getImages());
}
/**
Modified: trunk/source/de/anomic/plasma/plasmaSearchImages.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaSearchImages.java 2008-02-24
21:18:04 UTC (rev 4506)
+++ trunk/source/de/anomic/plasma/plasmaSearchImages.java 2008-02-25
14:08:15 UTC (rev 4507)
@@ -43,9 +43,10 @@
import java.io.InputStream;
import java.net.MalformedURLException;
+import java.util.HashMap;
import java.util.Iterator;
-import java.util.TreeSet;
+import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.plasma.parser.ParserException;
import de.anomic.server.serverDate;
@@ -53,11 +54,11 @@
public final class plasmaSearchImages {
- private TreeSet<htmlFilterImageEntry> images;
+ private HashMap<String, htmlFilterImageEntry> images;
public plasmaSearchImages(long maxTime, yacyURL url, int depth) {
long start = System.currentTimeMillis();
- this.images = new TreeSet<htmlFilterImageEntry>();
+ this.images = new HashMap<String, htmlFilterImageEntry>();
if (maxTime > 10) {
Object[] resource = plasmaSnippetCache.getResource(url, true,
(int) maxTime, false);
InputStream res = (InputStream) resource[0];
@@ -75,7 +76,7 @@
if (document == null) return;
// add the image links
- this.addAll(document.getImages());
+ htmlFilterContentScraper.addAllImages(this.images,
document.getImages());
// add also links from pages one step deeper, if depth > 0
if (depth > 0) {
@@ -97,26 +98,13 @@
public void addAll(plasmaSearchImages m) {
synchronized (m.images) {
- addAll(m.images);
+ htmlFilterContentScraper.addAllImages(this.images, m.images);
}
}
- private void addAll(TreeSet<htmlFilterImageEntry> ts) {
- Iterator<htmlFilterImageEntry> i = ts.iterator();
- htmlFilterImageEntry ie;
- while (i.hasNext()) {
- ie = i.next();
- if (images.contains(ie)) {
- if ((ie.height() > 0) && (ie.width() > 0)) images.add(ie);
- } else {
- images.add(ie);
- }
- }
- }
-
public Iterator<htmlFilterImageEntry> entries() {
// returns htmlFilterImageEntry - Objects
- return images.iterator();
+ return images.values().iterator();
}
}
Modified: trunk/source/de/anomic/plasma/plasmaSnippetCache.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaSnippetCache.java 2008-02-24
21:18:04 UTC (rev 4506)
+++ trunk/source/de/anomic/plasma/plasmaSnippetCache.java 2008-02-25
14:08:15 UTC (rev 4507)
@@ -697,7 +697,8 @@
public static ArrayList<MediaSnippet>
computeImageSnippets(plasmaParserDocument document, Set<String> queryhashes) {
- TreeSet<htmlFilterImageEntry> images = document.getImages(); //
iterates images in descending size order!
+ TreeSet<htmlFilterImageEntry> images = new
TreeSet<htmlFilterImageEntry>();
+ images.addAll(document.getImages().values()); // iterates images in
descending size order!
// a measurement for the size of the images can be retrieved using the
htmlFilterImageEntry.hashCode()
Iterator<htmlFilterImageEntry> i = images.iterator();
_______________________________________________
YaCy-svn mailing list
[email protected]
https://lists.berlios.de/mailman/listinfo/yacy-svn