Author: orbiter
Date: 2008-02-21 15:53:51 +0100 (Thu, 21 Feb 2008)
New Revision: 4499
Modified:
trunk/htroot/yacy/user/ysearchitem.html
trunk/htroot/yacy/user/ysearchitem.java
trunk/htroot/yacysearchitem.html
trunk/htroot/yacysearchitem.java
trunk/source/de/anomic/htmlFilter/htmlFilterImageEntry.java
trunk/source/de/anomic/plasma/plasmaSearchEvent.java
trunk/source/de/anomic/plasma/plasmaSnippetCache.java
Log:
- added image sorting by image size. This is the default now.
This is performed using a 3-stage sorting process:
- sort by relevance, then do snippet-fetch
- sort snippets by relevance then do image link extraction
- sort image links by image size; unknown sizes are handled like small sizes
- only the exact number of images requested is shown
Modified: trunk/htroot/yacy/user/ysearchitem.html
===================================================================
--- trunk/htroot/yacy/user/ysearchitem.html 2008-02-21 10:06:57 UTC (rev
4498)
+++ trunk/htroot/yacy/user/ysearchitem.html 2008-02-21 14:53:51 UTC (rev
4499)
@@ -13,7 +13,7 @@
<img src="/ViewImage.png?maxwidth=96&maxheight=96&code=#[code]#"
alt="#[name]#">
</a>
<div class="highslide-caption"><a href="#[href]#">#[name]#</a></div>
- <div class="snippet"><a href="#[href]#">#[name]#</a></div>
+ <div class="snippet"><a href="#[href]#">#[name]##[attr]#</a></div>
</div>
#{/items}#
::
Modified: trunk/htroot/yacy/user/ysearchitem.java
===================================================================
--- trunk/htroot/yacy/user/ysearchitem.java 2008-02-21 10:06:57 UTC (rev
4498)
+++ trunk/htroot/yacy/user/ysearchitem.java 2008-02-21 14:53:51 UTC (rev
4499)
@@ -164,35 +164,32 @@
return prop;
}
-
- // generate result object
- plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item);
-
- if (result == null) {
- // no content
- return prop;
- }
-
- if (rss) {
- // text search for rss output
- prop.put("rss", "1"); // switch on specific content
- prop.putHTML("rss_title", result.title(), true);
- prop.putHTML("rss_description", result.textSnippet().getLineRaw(),
true);
- prop.putHTML("rss_link", result.urlstring(), true);
- prop.put("rss_urlhash", result.hash());
- prop.put("rss_date",
plasmaSwitchboard.dateString822(result.modified()));
- return prop;
- }
-
+
prop.put("rss", "0");
if (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) {
// text search
+
+ // generate result object
+ plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item);
+ if (result == null) return prop; // no content
+
+ if (rss) {
+ // text search for rss output
+ prop.put("rss", "1"); // switch on specific content
+ prop.putHTML("rss_title", result.title(), true);
+ prop.putHTML("rss_description",
result.textSnippet().getLineRaw(), true);
+ prop.putHTML("rss_link", result.urlstring(), true);
+ prop.put("rss_urlhash", result.hash());
+ prop.put("rss_date",
plasmaSwitchboard.dateString822(result.modified()));
+ return prop;
+ }
+
prop.put("content", theQuery.contentdom + 1); // switch on
specific content
prop.put("content_authorized", authenticated ? "1" : "0");
prop.put("content_authorized_recommend",
(yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB,
yacyNewsPool.CATEGORY_SURFTIPP_ADD, "url", result.urlstring()) == null) ? "1" :
"0");
- prop.put("content_authorized_recommend_deletelink",
"/ysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" +
theQuery.displayResults() + "&offset=" + (theQuery.neededResults() -
theQuery.displayResults()) + "&order=" +
crypt.simpleEncode(theQuery.ranking.toExternalString()) +
"&resource=local&time=3&deleteref=" + result.hash() + "&urlmaskfilter=.*");
- prop.put("content_authorized_recommend_recommendlink",
"/ysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" +
theQuery.displayResults() + "&offset=" + (theQuery.neededResults() -
theQuery.displayResults()) + "&order=" +
crypt.simpleEncode(theQuery.ranking.toExternalString()) +
"&resource=local&time=3&recommendref=" + result.hash() + "&urlmaskfilter=.*");
+ prop.put("content_authorized_recommend_deletelink",
"/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" +
theQuery.displayResults() + "&offset=" + (theQuery.neededResults() -
theQuery.displayResults()) + "&order=" +
crypt.simpleEncode(theQuery.ranking.toExternalString()) +
"&resource=local&time=3&deleteref=" + result.hash() + "&urlmaskfilter=.*");
+ prop.put("content_authorized_recommend_recommendlink",
"/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" +
theQuery.displayResults() + "&offset=" + (theQuery.neededResults() -
theQuery.displayResults()) + "&order=" +
crypt.simpleEncode(theQuery.ranking.toExternalString()) +
"&resource=local&time=3&recommendref=" + result.hash() + "&urlmaskfilter=.*");
prop.put("content_authorized_urlhash", result.hash());
prop.putHTML("content_description", result.title());
prop.put("content_url", result.urlstring());
@@ -229,23 +226,17 @@
if (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) {
// image search; shows thumbnails
- // iterate over all images in the result
+
prop.put("content", theQuery.contentdom + 1); // switch on
specific content
- ArrayList<plasmaSnippetCache.MediaSnippet> images =
result.mediaSnippets();
- if (images != null) {
- plasmaSnippetCache.MediaSnippet ms;
- int c = 0;
- for (int i = 0; i < images.size(); i++) {
- ms = (plasmaSnippetCache.MediaSnippet) images.get(i);
- prop.putHTML("content_items_" + i + "_href",
ms.href.toNormalform(true, false));
- prop.put("content_items_" + i + "_code",
sb.licensedURLs.aquireLicense(ms.href));
- prop.putHTML("content_items_" + i + "_name",
shorten(ms.name, namelength));
- prop.put("content_items_" + i + "_attr", ms.attr); //
attributes, here: original size of image
- c++;
- }
- prop.put("content_items", c);
- } else {
+ plasmaSnippetCache.MediaSnippet ms = theSearch.oneImage(item);
+ if (ms == null) {
prop.put("content_items", "0");
+ } else {
+ prop.putHTML("content_items_0_href",
ms.href.toNormalform(true, false));
+ prop.put("content_items_0_code",
sb.licensedURLs.aquireLicense(ms.href));
+ prop.putHTML("content_items_0_name", shorten(ms.name,
namelength));
+ prop.put("content_items_0_attr", (ms.attr.equals("-1 x -1")) ?
"" : " (" + ms.attr + ")"); // attributes, here: original size of image
+ prop.put("content_items", 1);
}
return prop;
}
@@ -254,6 +245,11 @@
(theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) ||
(theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_APP)) {
// any other media content
+
+ // generate result object
+ plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item);
+ if (result == null) return prop; // no content
+
prop.put("content", theQuery.contentdom + 1); // switch on
specific content
ArrayList<plasmaSnippetCache.MediaSnippet> media =
result.mediaSnippets();
if (item == 0) col = true;
Modified: trunk/htroot/yacysearchitem.html
===================================================================
--- trunk/htroot/yacysearchitem.html 2008-02-21 10:06:57 UTC (rev 4498)
+++ trunk/htroot/yacysearchitem.html 2008-02-21 14:53:51 UTC (rev 4499)
@@ -26,7 +26,7 @@
<img src="/ViewImage.png?maxwidth=96&maxheight=96&code=#[code]#"
alt="#[name]#">
</a>
<div class="highslide-caption"><a href="#[href]#">#[name]#</a></div>
- <div class="TableCellDark"><a href="#[href]#">#[name]#</a></div>
+ <div class="TableCellDark"><a href="#[href]#">#[name]##[attr]#</a></div>
</div>
#{/items}#
::
Modified: trunk/htroot/yacysearchitem.java
===================================================================
--- trunk/htroot/yacysearchitem.java 2008-02-21 10:06:57 UTC (rev 4498)
+++ trunk/htroot/yacysearchitem.java 2008-02-21 14:53:51 UTC (rev 4499)
@@ -166,29 +166,26 @@
return prop;
}
- // generate result object
- plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item);
-
- if (result == null) {
- // no content
- return prop;
- }
-
- if (rss) {
- // text search for rss output
- prop.put("rss", "1"); // switch on specific content
- prop.putHTML("rss_title", result.title(), true);
- prop.putHTML("rss_description", result.textSnippet().getLineRaw(),
true);
- prop.putHTML("rss_link", result.urlstring(), true);
- prop.put("rss_urlhash", result.hash());
- prop.put("rss_date",
plasmaSwitchboard.dateString822(result.modified()));
- return prop;
- }
-
prop.put("rss", "0");
if (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) {
// text search
+
+ // generate result object
+ plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item);
+ if (result == null) return prop; // no content
+
+ if (rss) {
+ // text search for rss output
+ prop.put("rss", "1"); // switch on specific content
+ prop.putHTML("rss_title", result.title(), true);
+ prop.putHTML("rss_description",
result.textSnippet().getLineRaw(), true);
+ prop.putHTML("rss_link", result.urlstring(), true);
+ prop.put("rss_urlhash", result.hash());
+ prop.put("rss_date",
plasmaSwitchboard.dateString822(result.modified()));
+ return prop;
+ }
+
prop.put("content", theQuery.contentdom + 1); // switch on
specific content
prop.put("content_authorized", authenticated ? "1" : "0");
prop.put("content_authorized_recommend",
(yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB,
yacyNewsPool.CATEGORY_SURFTIPP_ADD, "url", result.urlstring()) == null) ? "1" :
"0");
@@ -230,23 +227,17 @@
if (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) {
// image search; shows thumbnails
- // iterate over all images in the result
+
prop.put("content", theQuery.contentdom + 1); // switch on
specific content
- ArrayList<plasmaSnippetCache.MediaSnippet> images =
result.mediaSnippets();
- if (images != null) {
- plasmaSnippetCache.MediaSnippet ms;
- int c = 0;
- for (int i = 0; i < images.size(); i++) {
- ms = (plasmaSnippetCache.MediaSnippet) images.get(i);
- prop.putHTML("content_items_" + i + "_href",
ms.href.toNormalform(true, false));
- prop.put("content_items_" + i + "_code",
sb.licensedURLs.aquireLicense(ms.href));
- prop.putHTML("content_items_" + i + "_name",
shorten(ms.name, namelength));
- prop.put("content_items_" + i + "_attr", ms.attr); //
attributes, here: original size of image
- c++;
- }
- prop.put("content_items", c);
- } else {
+ plasmaSnippetCache.MediaSnippet ms = theSearch.oneImage(item);
+ if (ms == null) {
prop.put("content_items", "0");
+ } else {
+ prop.putHTML("content_items_0_href",
ms.href.toNormalform(true, false));
+ prop.put("content_items_0_code",
sb.licensedURLs.aquireLicense(ms.href));
+ prop.putHTML("content_items_0_name", shorten(ms.name,
namelength));
+ prop.put("content_items_0_attr", (ms.attr.equals("-1 x -1")) ?
"" : " (" + ms.attr + ")"); // attributes, here: original size of image
+ prop.put("content_items", 1);
}
return prop;
}
@@ -255,6 +246,11 @@
(theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) ||
(theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_APP)) {
// any other media content
+
+ // generate result object
+ plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item);
+ if (result == null) return prop; // no content
+
prop.put("content", theQuery.contentdom + 1); // switch on
specific content
ArrayList<plasmaSnippetCache.MediaSnippet> media =
result.mediaSnippets();
if (item == 0) col = true;
Modified: trunk/source/de/anomic/htmlFilter/htmlFilterImageEntry.java
===================================================================
--- trunk/source/de/anomic/htmlFilter/htmlFilterImageEntry.java 2008-02-21
10:06:57 UTC (rev 4498)
+++ trunk/source/de/anomic/htmlFilter/htmlFilterImageEntry.java 2008-02-21
14:53:51 UTC (rev 4499)
@@ -80,10 +80,10 @@
// this hash method therefore tries to compute a 'perfect hash' based
on the size of the images
// unfortunately it can not be ensured that all images get different
hashes, but this should appear
// only in very rare cases
- if ((width > 0) && (height > 0))
- return ((0xFFFF - (((width * height) >> 8) & 0xFFFF)) << 16) |
(url.hashCode() & 0xFFFF);
+ if ((width >= 0) && (height >= 0))
+ return ((0x7FFF - (((width * height) >> 9) & 0x7FFF)) << 16) |
(url.hashCode() & 0xFFFF);
else
- return 0xFFFF0000 | (url.hashCode() & 0xFFFF);
+ return 0x7FFF0000 | (url.hashCode() & 0xFFFF);
}
public int compareTo(htmlFilterImageEntry h) {
Modified: trunk/source/de/anomic/plasma/plasmaSearchEvent.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaSearchEvent.java 2008-02-21
10:06:57 UTC (rev 4498)
+++ trunk/source/de/anomic/plasma/plasmaSearchEvent.java 2008-02-21
14:53:51 UTC (rev 4499)
@@ -81,6 +81,7 @@
public String IAmaxcounthash, IAneardhthash;
private resultWorker[] workerThreads;
private kelondroSortStore<ResultEntry> result;
+ private kelondroSortStore<plasmaSnippetCache.MediaSnippet> images; //
container to sort images by size
private HashMap<String, String> failedURLs; // a mapping from a urlhash to
a fail reason string
TreeSet<String> snippetFetchWordHashes; // a set of word hashes that are
used to match with the snippets
private long urlRetrievalAllTime;
@@ -107,6 +108,7 @@
this.workerThreads = null;
this.localSearchThread = null;
this.result = new kelondroSortStore<ResultEntry>(-1); // this is the
result, enriched with snippets, ranked and ordered by ranking
+ this.images = new
kelondroSortStore<plasmaSnippetCache.MediaSnippet>(-1);
this.failedURLs = new HashMap<String, String>(); // a map of urls to
reason strings where a worker thread tried to work on, but failed.
// snippets do not need to match with the complete query hashes,
@@ -465,7 +467,8 @@
// if worker threads had been alive, but did not succeed, start
them again to fetch missing links
if ((query.onlineSnippetFetch) &&
(!event.anyWorkerAlive()) &&
- (event.result.size() < query.neededResults() + 10) &&
+ (((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) &&
(event.images.size() + 30 < query.neededResults())) ||
+ (event.result.size() < query.neededResults() + 10)) &&
(event.getRankingResult().getLocalResourceSize() +
event.getRankingResult().getRemoteResourceSize() > event.result.size())) {
// set new timeout
event.eventTime = System.currentTimeMillis();
@@ -507,7 +510,9 @@
while (System.currentTimeMillis() < this.timeout) {
this.lastLifeSign = System.currentTimeMillis();
- if (result.size() >= query.neededResults() /*+
query.displayResults()*/) break; // we have enough
+ // check if we have enough
+ if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE)
&& (images.size() >= query.neededResults() + 30)) break;
+ if ((query.contentdom != plasmaSearchQuery.CONTENTDOM_IMAGE)
&& (result.size() >= query.neededResults() + 10 /*+ query.displayResults()*/))
break;
// get next entry
page = rankedCache.bestURL(true);
@@ -558,7 +563,7 @@
}
public ResultEntry oneResult(int item) {
- // first sleep a while to give accumulation threads a chance to work
+ // check if we already retrieved this item (happens if a search pages
is accessed a second time)
if (this.result.sizeStore() > item) {
// we have the wanted result already in the result array .. return
that
return this.result.element(item).element;
@@ -589,6 +594,43 @@
return this.result.element(item).element;
}
+ private int resultCounter = 0;
+ public ResultEntry nextResult() {
+ ResultEntry re = oneResult(resultCounter);
+ resultCounter++;
+ return re;
+ }
+
+ public plasmaSnippetCache.MediaSnippet oneImage(int item) {
+ // check if we already retrieved this item (happens if a search pages
is accessed a second time)
+ if (this.images.sizeStore() > item) {
+ // we have the wanted result already in the result array .. return
that
+ return this.images.element(item).element;
+ }
+
+ // feed some results from the result stack into the image stack
+ int count = Math.min(5, Math.max(1, 10 * this.result.size() / (item +
1)));
+ for (int i = 0; i < count; i++) {
+ // generate result object
+ plasmaSearchEvent.ResultEntry result = nextResult();
+ plasmaSnippetCache.MediaSnippet ms;
+ if (result != null) {
+ // iterate over all images in the result
+ ArrayList<plasmaSnippetCache.MediaSnippet> imagemedia =
result.mediaSnippets();
+ if (imagemedia != null) {
+ for (int j = 0; j < imagemedia.size(); j++) {
+ ms = imagemedia.get(j);
+ images.push(ms, ms.ranking);
+ }
+ }
+ }
+ }
+
+ // now take the specific item from the image stack
+ if (this.images.size() <= item) return null;
+ return this.images.element(item).element;
+ }
+
public ArrayList<kelondroSortStack<ResultEntry>.stackElement>
completeResults(long waitingtime) {
long timeout = System.currentTimeMillis() + waitingtime;
while ((result.size() < query.neededResults()) && (anyWorkerAlive())
&& (System.currentTimeMillis() < timeout)) {
Modified: trunk/source/de/anomic/plasma/plasmaSnippetCache.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaSnippetCache.java 2008-02-21
10:06:57 UTC (rev 4498)
+++ trunk/source/de/anomic/plasma/plasmaSnippetCache.java 2008-02-21
14:53:51 UTC (rev 4499)
@@ -232,11 +232,13 @@
public int type;
public yacyURL href;
public String name, attr;
- public MediaSnippet(int type, yacyURL href, String name, String attr) {
+ public int ranking;
+ public MediaSnippet(int type, yacyURL href, String name, String attr,
int ranking) {
this.type = type;
this.href = href;
this.name = name;
this.attr = attr;
+ this.ranking = ranking; // the smaller the better! small values
should be shown first
if ((this.name == null) || (this.name.length() == 0)) this.name =
"_";
if ((this.attr == null) || (this.attr.length() == 0)) this.attr =
"_";
}
@@ -677,12 +679,12 @@
desc = entry.getValue();
s = removeAppearanceHashes(url.toNormalform(false, false),
queryhashes);
if (s.size() == 0) {
- result.add(new MediaSnippet(mediatype, url, desc, null));
+ result.add(new MediaSnippet(mediatype, url, desc, null, 0));
continue;
}
s = removeAppearanceHashes(desc, s);
if (s.size() == 0) {
- result.add(new MediaSnippet(mediatype, url, desc, null));
+ result.add(new MediaSnippet(mediatype, url, desc, null, 0));
continue;
}
}
@@ -691,7 +693,8 @@
public static ArrayList<MediaSnippet>
computeImageSnippets(plasmaParserDocument document, Set<String> queryhashes) {
- TreeSet<htmlFilterImageEntry> images = document.getImages();
+ TreeSet<htmlFilterImageEntry> images = document.getImages(); //
iterates images in descending size order!
+ // a measurement for the size of the images can be retrieved using the
htmlFilterImageEntry.hashCode()
Iterator<htmlFilterImageEntry> i = images.iterator();
htmlFilterImageEntry ientry;
@@ -705,12 +708,16 @@
desc = ientry.alt();
s = removeAppearanceHashes(url.toNormalform(false, false),
queryhashes);
if (s.size() == 0) {
- result.add(new
MediaSnippet(plasmaSearchQuery.CONTENTDOM_IMAGE, url, desc, ientry.width() + "
x " + ientry.height()));
+ int ranking = ientry.hashCode();
+ System.out.println(ranking);
+ result.add(new
MediaSnippet(plasmaSearchQuery.CONTENTDOM_IMAGE, url, desc, ientry.width() + "
x " + ientry.height(), ranking));
continue;
}
s = removeAppearanceHashes(desc, s);
if (s.size() == 0) {
- result.add(new
MediaSnippet(plasmaSearchQuery.CONTENTDOM_IMAGE, url, desc, ientry.width() + "
x " + ientry.height()));
+ int ranking = ientry.hashCode();
+ System.out.println(ranking);
+ result.add(new
MediaSnippet(plasmaSearchQuery.CONTENTDOM_IMAGE, url, desc, ientry.width() + "
x " + ientry.height(), ranking));
continue;
}
}
_______________________________________________
YaCy-svn mailing list
[email protected]
https://lists.berlios.de/mailman/listinfo/yacy-svn