Author: orbiter
Date: 2008-02-21 15:53:51 +0100 (Thu, 21 Feb 2008)
New Revision: 4499

Modified:
   trunk/htroot/yacy/user/ysearchitem.html
   trunk/htroot/yacy/user/ysearchitem.java
   trunk/htroot/yacysearchitem.html
   trunk/htroot/yacysearchitem.java
   trunk/source/de/anomic/htmlFilter/htmlFilterImageEntry.java
   trunk/source/de/anomic/plasma/plasmaSearchEvent.java
   trunk/source/de/anomic/plasma/plasmaSnippetCache.java
Log:
- added image sorting by image size. This is the default now.
  This is performed using a 3-stage sorting process:
  - sort by relevance, then do snippet-fetch
  - sort snippets by relevance then do image link extraction
  - sort image links by image size; unknown sizes are handled like small sizes
- only the exact number of images requested is shown

Modified: trunk/htroot/yacy/user/ysearchitem.html
===================================================================
--- trunk/htroot/yacy/user/ysearchitem.html     2008-02-21 10:06:57 UTC (rev 
4498)
+++ trunk/htroot/yacy/user/ysearchitem.html     2008-02-21 14:53:51 UTC (rev 
4499)
@@ -13,7 +13,7 @@
       <img src="/ViewImage.png?maxwidth=96&maxheight=96&code=#[code]#" 
alt="#[name]#">
     </a>
     <div class="highslide-caption"><a href="#[href]#">#[name]#</a></div>
-    <div class="snippet"><a href="#[href]#">#[name]#</a></div>
+    <div class="snippet"><a href="#[href]#">#[name]##[attr]#</a></div>
   </div>
   #{/items}#
   ::

Modified: trunk/htroot/yacy/user/ysearchitem.java
===================================================================
--- trunk/htroot/yacy/user/ysearchitem.java     2008-02-21 10:06:57 UTC (rev 
4498)
+++ trunk/htroot/yacy/user/ysearchitem.java     2008-02-21 14:53:51 UTC (rev 
4499)
@@ -164,35 +164,32 @@
             
             return prop;
         }
-        
-        // generate result object
-        plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item);
-        
-        if (result == null) {
-            // no content
-            return prop;
-        }
-            
-        if (rss) {
-            // text search for rss output
-            prop.put("rss", "1"); // switch on specific content
-            prop.putHTML("rss_title", result.title(), true);
-            prop.putHTML("rss_description", result.textSnippet().getLineRaw(), 
true);
-            prop.putHTML("rss_link", result.urlstring(), true);
-            prop.put("rss_urlhash", result.hash());
-            prop.put("rss_date", 
plasmaSwitchboard.dateString822(result.modified()));
-            return prop;
-        }
-        
+
         prop.put("rss", "0");
         
         if (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) {
             // text search
+
+            // generate result object
+            plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item);
+            if (result == null) return prop; // no content
+                
+            if (rss) {
+                // text search for rss output
+                prop.put("rss", "1"); // switch on specific content
+                prop.putHTML("rss_title", result.title(), true);
+                prop.putHTML("rss_description", 
result.textSnippet().getLineRaw(), true);
+                prop.putHTML("rss_link", result.urlstring(), true);
+                prop.put("rss_urlhash", result.hash());
+                prop.put("rss_date", 
plasmaSwitchboard.dateString822(result.modified()));
+                return prop;
+            }
+            
             prop.put("content", theQuery.contentdom + 1); // switch on 
specific content
             prop.put("content_authorized", authenticated ? "1" : "0");
             prop.put("content_authorized_recommend", 
(yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, 
yacyNewsPool.CATEGORY_SURFTIPP_ADD, "url", result.urlstring()) == null) ? "1" : 
"0");
-            prop.put("content_authorized_recommend_deletelink", 
"/ysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + 
theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - 
theQuery.displayResults()) + "&order=" + 
crypt.simpleEncode(theQuery.ranking.toExternalString()) + 
"&resource=local&time=3&deleteref=" + result.hash() + "&urlmaskfilter=.*");
-            prop.put("content_authorized_recommend_recommendlink", 
"/ysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + 
theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - 
theQuery.displayResults()) + "&order=" + 
crypt.simpleEncode(theQuery.ranking.toExternalString()) + 
"&resource=local&time=3&recommendref=" + result.hash() + "&urlmaskfilter=.*");
+            prop.put("content_authorized_recommend_deletelink", 
"/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + 
theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - 
theQuery.displayResults()) + "&order=" + 
crypt.simpleEncode(theQuery.ranking.toExternalString()) + 
"&resource=local&time=3&deleteref=" + result.hash() + "&urlmaskfilter=.*");
+            prop.put("content_authorized_recommend_recommendlink", 
"/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + 
theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - 
theQuery.displayResults()) + "&order=" + 
crypt.simpleEncode(theQuery.ranking.toExternalString()) + 
"&resource=local&time=3&recommendref=" + result.hash() + "&urlmaskfilter=.*");
             prop.put("content_authorized_urlhash", result.hash());
             prop.putHTML("content_description", result.title());
             prop.put("content_url", result.urlstring());
@@ -229,23 +226,17 @@
         
         if (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) {
             // image search; shows thumbnails
-            // iterate over all images in the result
+
             prop.put("content", theQuery.contentdom + 1); // switch on 
specific content
-            ArrayList<plasmaSnippetCache.MediaSnippet> images = 
result.mediaSnippets();
-            if (images != null) {
-                plasmaSnippetCache.MediaSnippet ms;
-                int c = 0;
-                for (int i = 0; i < images.size(); i++) {
-                    ms = (plasmaSnippetCache.MediaSnippet) images.get(i);
-                    prop.putHTML("content_items_" + i + "_href", 
ms.href.toNormalform(true, false));
-                    prop.put("content_items_" + i + "_code", 
sb.licensedURLs.aquireLicense(ms.href));
-                    prop.putHTML("content_items_" + i + "_name", 
shorten(ms.name, namelength));
-                    prop.put("content_items_" + i + "_attr", ms.attr); // 
attributes, here: original size of image
-                    c++;
-                }
-                prop.put("content_items", c);
-            } else {
+            plasmaSnippetCache.MediaSnippet ms = theSearch.oneImage(item);
+            if (ms == null) {
                 prop.put("content_items", "0");
+            } else {
+                prop.putHTML("content_items_0_href", 
ms.href.toNormalform(true, false));
+                prop.put("content_items_0_code", 
sb.licensedURLs.aquireLicense(ms.href));
+                prop.putHTML("content_items_0_name", shorten(ms.name, 
namelength));
+                prop.put("content_items_0_attr", (ms.attr.equals("-1 x -1")) ? 
"" : " (" + ms.attr + ")"); // attributes, here: original size of image
+                prop.put("content_items", 1);
             }
             return prop;
         }
@@ -254,6 +245,11 @@
             (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) ||
             (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_APP)) {
             // any other media content
+
+            // generate result object
+            plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item);
+            if (result == null) return prop; // no content
+            
             prop.put("content", theQuery.contentdom + 1); // switch on 
specific content
             ArrayList<plasmaSnippetCache.MediaSnippet> media = 
result.mediaSnippets();
             if (item == 0) col = true;

Modified: trunk/htroot/yacysearchitem.html
===================================================================
--- trunk/htroot/yacysearchitem.html    2008-02-21 10:06:57 UTC (rev 4498)
+++ trunk/htroot/yacysearchitem.html    2008-02-21 14:53:51 UTC (rev 4499)
@@ -26,7 +26,7 @@
       <img src="/ViewImage.png?maxwidth=96&maxheight=96&code=#[code]#" 
alt="#[name]#">
     </a>
     <div class="highslide-caption"><a href="#[href]#">#[name]#</a></div>
-    <div class="TableCellDark"><a href="#[href]#">#[name]#</a></div>
+    <div class="TableCellDark"><a href="#[href]#">#[name]##[attr]#</a></div>
   </div>
   #{/items}#
   ::

Modified: trunk/htroot/yacysearchitem.java
===================================================================
--- trunk/htroot/yacysearchitem.java    2008-02-21 10:06:57 UTC (rev 4498)
+++ trunk/htroot/yacysearchitem.java    2008-02-21 14:53:51 UTC (rev 4499)
@@ -166,29 +166,26 @@
             return prop;
         }
         
-        // generate result object
-        plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item);
-        
-        if (result == null) {
-            // no content
-            return prop;
-        }
-            
-        if (rss) {
-            // text search for rss output
-            prop.put("rss", "1"); // switch on specific content
-            prop.putHTML("rss_title", result.title(), true);
-            prop.putHTML("rss_description", result.textSnippet().getLineRaw(), 
true);
-            prop.putHTML("rss_link", result.urlstring(), true);
-            prop.put("rss_urlhash", result.hash());
-            prop.put("rss_date", 
plasmaSwitchboard.dateString822(result.modified()));
-            return prop;
-        }
-        
         prop.put("rss", "0");
         
         if (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) {
             // text search
+
+            // generate result object
+            plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item);
+            if (result == null) return prop; // no content
+                
+            if (rss) {
+                // text search for rss output
+                prop.put("rss", "1"); // switch on specific content
+                prop.putHTML("rss_title", result.title(), true);
+                prop.putHTML("rss_description", 
result.textSnippet().getLineRaw(), true);
+                prop.putHTML("rss_link", result.urlstring(), true);
+                prop.put("rss_urlhash", result.hash());
+                prop.put("rss_date", 
plasmaSwitchboard.dateString822(result.modified()));
+                return prop;
+            }
+            
             prop.put("content", theQuery.contentdom + 1); // switch on 
specific content
             prop.put("content_authorized", authenticated ? "1" : "0");
             prop.put("content_authorized_recommend", 
(yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, 
yacyNewsPool.CATEGORY_SURFTIPP_ADD, "url", result.urlstring()) == null) ? "1" : 
"0");
@@ -230,23 +227,17 @@
         
         if (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) {
             // image search; shows thumbnails
-            // iterate over all images in the result
+
             prop.put("content", theQuery.contentdom + 1); // switch on 
specific content
-            ArrayList<plasmaSnippetCache.MediaSnippet> images = 
result.mediaSnippets();
-            if (images != null) {
-                plasmaSnippetCache.MediaSnippet ms;
-                int c = 0;
-                for (int i = 0; i < images.size(); i++) {
-                    ms = (plasmaSnippetCache.MediaSnippet) images.get(i);
-                    prop.putHTML("content_items_" + i + "_href", 
ms.href.toNormalform(true, false));
-                    prop.put("content_items_" + i + "_code", 
sb.licensedURLs.aquireLicense(ms.href));
-                    prop.putHTML("content_items_" + i + "_name", 
shorten(ms.name, namelength));
-                    prop.put("content_items_" + i + "_attr", ms.attr); // 
attributes, here: original size of image
-                    c++;
-                }
-                prop.put("content_items", c);
-            } else {
+            plasmaSnippetCache.MediaSnippet ms = theSearch.oneImage(item);
+            if (ms == null) {
                 prop.put("content_items", "0");
+            } else {
+                prop.putHTML("content_items_0_href", 
ms.href.toNormalform(true, false));
+                prop.put("content_items_0_code", 
sb.licensedURLs.aquireLicense(ms.href));
+                prop.putHTML("content_items_0_name", shorten(ms.name, 
namelength));
+                prop.put("content_items_0_attr", (ms.attr.equals("-1 x -1")) ? 
"" : " (" + ms.attr + ")"); // attributes, here: original size of image
+                prop.put("content_items", 1);
             }
             return prop;
         }
@@ -255,6 +246,11 @@
             (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) ||
             (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_APP)) {
             // any other media content
+
+            // generate result object
+            plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item);
+            if (result == null) return prop; // no content
+            
             prop.put("content", theQuery.contentdom + 1); // switch on 
specific content
             ArrayList<plasmaSnippetCache.MediaSnippet> media = 
result.mediaSnippets();
             if (item == 0) col = true;

Modified: trunk/source/de/anomic/htmlFilter/htmlFilterImageEntry.java
===================================================================
--- trunk/source/de/anomic/htmlFilter/htmlFilterImageEntry.java 2008-02-21 
10:06:57 UTC (rev 4498)
+++ trunk/source/de/anomic/htmlFilter/htmlFilterImageEntry.java 2008-02-21 
14:53:51 UTC (rev 4499)
@@ -80,10 +80,10 @@
         // this hash method therefore tries to compute a 'perfect hash' based 
on the size of the images
         // unfortunately it can not be ensured that all images get different 
hashes, but this should appear
         // only in very rare cases
-        if ((width > 0) && (height > 0))
-            return ((0xFFFF - (((width * height) >> 8) & 0xFFFF)) << 16) | 
(url.hashCode() & 0xFFFF);
+        if ((width >= 0) && (height >= 0))
+            return ((0x7FFF - (((width * height) >> 9) & 0x7FFF)) << 16) | 
(url.hashCode() & 0xFFFF);
         else
-            return 0xFFFF0000 | (url.hashCode() & 0xFFFF);
+            return 0x7FFF0000 | (url.hashCode() & 0xFFFF);
     }
     
     public int compareTo(htmlFilterImageEntry h) {

Modified: trunk/source/de/anomic/plasma/plasmaSearchEvent.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaSearchEvent.java        2008-02-21 
10:06:57 UTC (rev 4498)
+++ trunk/source/de/anomic/plasma/plasmaSearchEvent.java        2008-02-21 
14:53:51 UTC (rev 4499)
@@ -81,6 +81,7 @@
     public  String IAmaxcounthash, IAneardhthash;
     private resultWorker[] workerThreads;
     private kelondroSortStore<ResultEntry> result;
+    private kelondroSortStore<plasmaSnippetCache.MediaSnippet> images; // 
container to sort images by size
     private HashMap<String, String> failedURLs; // a mapping from a urlhash to 
a fail reason string
     TreeSet<String> snippetFetchWordHashes; // a set of word hashes that are 
used to match with the snippets
     private long urlRetrievalAllTime;
@@ -107,6 +108,7 @@
         this.workerThreads = null;
         this.localSearchThread = null;
         this.result = new kelondroSortStore<ResultEntry>(-1); // this is the 
result, enriched with snippets, ranked and ordered by ranking
+        this.images = new 
kelondroSortStore<plasmaSnippetCache.MediaSnippet>(-1);
         this.failedURLs = new HashMap<String, String>(); // a map of urls to 
reason strings where a worker thread tried to work on, but failed.
         
         // snippets do not need to match with the complete query hashes,
@@ -465,7 +467,8 @@
             // if worker threads had been alive, but did not succeed, start 
them again to fetch missing links
             if ((query.onlineSnippetFetch) &&
                 (!event.anyWorkerAlive()) &&
-                (event.result.size() < query.neededResults() + 10) &&
+                (((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && 
(event.images.size() + 30 < query.neededResults())) ||
+                 (event.result.size() < query.neededResults() + 10)) &&
                 (event.getRankingResult().getLocalResourceSize() + 
event.getRankingResult().getRemoteResourceSize() > event.result.size())) {
                 // set new timeout
                 event.eventTime = System.currentTimeMillis();
@@ -507,7 +510,9 @@
             while (System.currentTimeMillis() < this.timeout) {
                 this.lastLifeSign = System.currentTimeMillis();
 
-                if (result.size() >= query.neededResults() /*+ 
query.displayResults()*/) break; // we have enough
+                // check if we have enough
+                if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) 
&& (images.size() >= query.neededResults() + 30)) break;
+                if ((query.contentdom != plasmaSearchQuery.CONTENTDOM_IMAGE) 
&& (result.size() >= query.neededResults() + 10 /*+ query.displayResults()*/)) 
break;
 
                 // get next entry
                 page = rankedCache.bestURL(true);
@@ -558,7 +563,7 @@
     }
     
     public ResultEntry oneResult(int item) {
-        // first sleep a while to give accumulation threads a chance to work
+        // check if we already retrieved this item (happens if a search pages 
is accessed a second time)
         if (this.result.sizeStore() > item) {
             // we have the wanted result already in the result array .. return 
that
             return this.result.element(item).element;
@@ -589,6 +594,43 @@
         return this.result.element(item).element;
     }
 
+    private int resultCounter = 0;
+    public ResultEntry nextResult() {
+        ResultEntry re = oneResult(resultCounter);
+        resultCounter++;
+        return re;
+    }
+    
+    public plasmaSnippetCache.MediaSnippet oneImage(int item) {
+        // check if we already retrieved this item (happens if a search pages 
is accessed a second time)
+        if (this.images.sizeStore() > item) {
+            // we have the wanted result already in the result array .. return 
that
+            return this.images.element(item).element;
+        }
+        
+        // feed some results from the result stack into the image stack
+        int count = Math.min(5, Math.max(1, 10 * this.result.size() / (item + 
1)));
+        for (int i = 0; i < count; i++) {
+            // generate result object
+            plasmaSearchEvent.ResultEntry result = nextResult();
+            plasmaSnippetCache.MediaSnippet ms;
+            if (result != null) {
+                // iterate over all images in the result
+                ArrayList<plasmaSnippetCache.MediaSnippet> imagemedia = 
result.mediaSnippets();
+                if (imagemedia != null) {
+                    for (int j = 0; j < imagemedia.size(); j++) {
+                        ms = imagemedia.get(j);
+                        images.push(ms, ms.ranking);
+                    }
+                }
+            }
+        }
+        
+        // now take the specific item from the image stack
+        if (this.images.size() <= item) return null;
+        return this.images.element(item).element;
+    }
+    
     public ArrayList<kelondroSortStack<ResultEntry>.stackElement> 
completeResults(long waitingtime) {
         long timeout = System.currentTimeMillis() + waitingtime;
         while ((result.size() < query.neededResults()) && (anyWorkerAlive()) 
&& (System.currentTimeMillis() < timeout)) {

Modified: trunk/source/de/anomic/plasma/plasmaSnippetCache.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaSnippetCache.java       2008-02-21 
10:06:57 UTC (rev 4498)
+++ trunk/source/de/anomic/plasma/plasmaSnippetCache.java       2008-02-21 
14:53:51 UTC (rev 4499)
@@ -232,11 +232,13 @@
         public int type;
         public yacyURL href;
         public String name, attr;
-        public MediaSnippet(int type, yacyURL href, String name, String attr) {
+        public int ranking;
+        public MediaSnippet(int type, yacyURL href, String name, String attr, 
int ranking) {
             this.type = type;
             this.href = href;
             this.name = name;
             this.attr = attr;
+            this.ranking = ranking; // the smaller the better! small values 
should be shown first
             if ((this.name == null) || (this.name.length() == 0)) this.name = 
"_";
             if ((this.attr == null) || (this.attr.length() == 0)) this.attr = 
"_";
         }
@@ -677,12 +679,12 @@
             desc = entry.getValue();
             s = removeAppearanceHashes(url.toNormalform(false, false), 
queryhashes);
             if (s.size() == 0) {
-                result.add(new MediaSnippet(mediatype, url, desc, null));
+                result.add(new MediaSnippet(mediatype, url, desc, null, 0));
                 continue;
             }
             s = removeAppearanceHashes(desc, s);
             if (s.size() == 0) {
-                result.add(new MediaSnippet(mediatype, url, desc, null));
+                result.add(new MediaSnippet(mediatype, url, desc, null, 0));
                 continue;
             }
         }
@@ -691,7 +693,8 @@
     
     public static ArrayList<MediaSnippet> 
computeImageSnippets(plasmaParserDocument document, Set<String> queryhashes) {
         
-        TreeSet<htmlFilterImageEntry> images = document.getImages();
+        TreeSet<htmlFilterImageEntry> images = document.getImages(); // 
iterates images in descending size order!
+        // a measurement for the size of the images can be retrieved using the 
htmlFilterImageEntry.hashCode()
         
         Iterator<htmlFilterImageEntry> i = images.iterator();
         htmlFilterImageEntry ientry;
@@ -705,12 +708,16 @@
             desc = ientry.alt();
             s = removeAppearanceHashes(url.toNormalform(false, false), 
queryhashes);
             if (s.size() == 0) {
-                result.add(new 
MediaSnippet(plasmaSearchQuery.CONTENTDOM_IMAGE, url, desc, ientry.width() + " 
x " + ientry.height()));
+                int ranking = ientry.hashCode();
+                System.out.println(ranking);
+                result.add(new 
MediaSnippet(plasmaSearchQuery.CONTENTDOM_IMAGE, url, desc, ientry.width() + " 
x " + ientry.height(), ranking));
                 continue;
             }
             s = removeAppearanceHashes(desc, s);
             if (s.size() == 0) {
-                result.add(new 
MediaSnippet(plasmaSearchQuery.CONTENTDOM_IMAGE, url, desc, ientry.width() + " 
x " + ientry.height()));
+                int ranking = ientry.hashCode();
+                System.out.println(ranking);
+                result.add(new 
MediaSnippet(plasmaSearchQuery.CONTENTDOM_IMAGE, url, desc, ientry.width() + " 
x " + ientry.height(), ranking));
                 continue;
             }
         }

_______________________________________________
YaCy-svn mailing list
YaCy-svn@lists.berlios.de
https://lists.berlios.de/mailman/listinfo/yacy-svn

Antwort per Email an