Author: hermens
Date: 2006-03-10 17:28:01 +0100 (Fri, 10 Mar 2006)
New Revision: 1874

Modified:
   trunk/source/de/anomic/plasma/plasmaCrawlLURL.java
   trunk/source/de/anomic/plasma/plasmaWordIndexEntity.java
   trunk/source/de/anomic/plasma/plasmaWordIndexEntry.java
   trunk/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java
Log:
*) Don't overwrite new entries with older ones
   see: http://www.yacy-forum.de/viewtopic.php?t=2015



Modified: trunk/source/de/anomic/plasma/plasmaCrawlLURL.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaCrawlLURL.java  2006-03-10 13:57:30 UTC (rev 1873)
+++ trunk/source/de/anomic/plasma/plasmaCrawlLURL.java  2006-03-10 16:28:01 UTC (rev 1874)
@@ -384,7 +384,7 @@
 
     public class Entry {
 
-    private URL url;
+        private URL url;
 
         private String descr;
         private Date moddate;
@@ -465,171 +465,214 @@
             }
         }
 
-    public Entry(Properties prop, boolean setGlobal) {
-        // generates an plasmaLURLEntry using the properties from the argument
-        // the property names must correspond to the one from toString
-        //System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
-        this.urlHash = prop.getProperty("hash", dummyHash);
-        try {
-        //byte[][] entry = urlHashCache.get(urlHash.getBytes());
-        //if (entry == null) {
-            this.referrerHash = prop.getProperty("referrer", dummyHash);
-            this.moddate = shortDayFormatter.parse(prop.getProperty("mod", 
"20000101"));
-            //System.out.println("DEBUG: moddate = " + moddate + ", prop=" + 
prop.getProperty("mod"));
-            this.loaddate = shortDayFormatter.parse(prop.getProperty("load", 
"20000101"));
-            this.copyCount = Integer.parseInt(prop.getProperty("cc", "0"));
-            this.flags = ((prop.getProperty("local", "true").equals("true")) ? 
"L " : "  ");
-            if (setGlobal) this.flags = "G ";
-            this.url = new URL(crypt.simpleDecode(prop.getProperty("url", ""), 
null));
-            this.descr = crypt.simpleDecode(prop.getProperty("descr", ""), 
null);
-                    if (this.descr == null) this.descr = this.url.toString();
-            this.quality = (int) 
kelondroBase64Order.enhancedCoder.decodeLong(prop.getProperty("q", ""));
-            this.language = prop.getProperty("lang", "uk");
-            this.doctype = prop.getProperty("dt", "t").charAt(0);
-            this.size = Integer.parseInt(prop.getProperty("size", "0"));
-            this.wordCount = Integer.parseInt(prop.getProperty("wc", "0"));
-            this.snippet = prop.getProperty("snippet", "");
-            if (snippet.length() == 0) snippet = null; else snippet = 
crypt.simpleDecode(snippet, null);
-            this.word = (prop.containsKey("word")) ? new 
plasmaWordIndexEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word","")))
 : null;
-            store();
-            //}
-        } catch (Exception e) {
-        serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/2: " 
+ e.toString(), e);
+        public Entry(Properties prop, boolean setGlobal) {
+            // generates an plasmaLURLEntry using the properties from the 
argument
+            // the property names must correspond to the one from toString
+            //System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
+            this.urlHash = prop.getProperty("hash", dummyHash);
+            try {
+                //byte[][] entry = urlHashCache.get(urlHash.getBytes());
+                //if (entry == null) {
+                this.referrerHash = prop.getProperty("referrer", dummyHash);
+                this.moddate = shortDayFormatter.parse(prop.getProperty("mod", 
"20000101"));
+                //System.out.println("DEBUG: moddate = " + moddate + ", prop=" 
+ prop.getProperty("mod"));
+                this.loaddate = 
shortDayFormatter.parse(prop.getProperty("load", "20000101"));
+                this.copyCount = Integer.parseInt(prop.getProperty("cc", "0"));
+                this.flags = ((prop.getProperty("local", 
"true").equals("true")) ? "L " : "  ");
+                if (setGlobal) this.flags = "G ";
+                this.url = new URL(crypt.simpleDecode(prop.getProperty("url", 
""), null));
+                this.descr = crypt.simpleDecode(prop.getProperty("descr", ""), 
null);
+                        if (this.descr == null) this.descr = 
this.url.toString();
+                this.quality = (int) 
kelondroBase64Order.enhancedCoder.decodeLong(prop.getProperty("q", ""));
+                this.language = prop.getProperty("lang", "uk");
+                this.doctype = prop.getProperty("dt", "t").charAt(0);
+                this.size = Integer.parseInt(prop.getProperty("size", "0"));
+                this.wordCount = Integer.parseInt(prop.getProperty("wc", "0"));
+                this.snippet = prop.getProperty("snippet", "");
+                if (snippet.length() == 0) snippet = null; else snippet = 
crypt.simpleDecode(snippet, null);
+                this.word = (prop.containsKey("word")) ? new 
plasmaWordIndexEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word","")))
 : null;
+                store();
+                //}
+            } catch (Exception e) {
+                serverLog.logSevere("PLASMA", "INTERNAL ERROR in 
plasmaLURL.entry/2: " + e.toString(), e);
+            }
         }
-    }
 
-    private void store() {
-        // stores the values from the object variables into the database
-        final String moddatestr = 
kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, 
urlDateLength);
-        final String loaddatestr = 
kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, 
urlDateLength);
+        private void store() {
+            // Check if there is a more recent Entry already in the DB
+            Entry oldEntry;
+            try {
+                if (exists(urlHash)) {
+                    oldEntry = new Entry (urlHash, null);
+                } else {
+                    oldEntry = null;
+                }
+            } catch (Exception e) {
+                oldEntry = null;
+            }
+            if ((oldEntry != null) && (isOlder(oldEntry))) {
+                // the fetched oldEntry is better, so return its properties 
instead of the new ones
+                // this.urlHash = oldEntry.urlHash; // unnecessary, should be 
the same
+                // this.url = oldEntry.url; // unnecessary, should be the same
+                this.descr = oldEntry.descr;
+                this.moddate = oldEntry.moddate;
+                this.loaddate = oldEntry.loaddate;
+                this.referrerHash = oldEntry.referrerHash;
+                this.copyCount = oldEntry.copyCount;
+                this.flags =  oldEntry.flags;
+                this.quality = oldEntry.quality;
+                this.language = oldEntry.language;
+                this.doctype = oldEntry.doctype;
+                this.size = oldEntry.size;
+                this.wordCount = oldEntry.wordCount;
+                // this.snippet // not read from db
+                // this.word // not read from db
+                return;
+            }
 
-        // store the hash in the hash cache
-        try {
-            // even if the entry exists, we simply overwrite it
-            final byte[][] entry = new byte[][] {
-            urlHash.getBytes(),
-            url.toString().getBytes(),
-            descr.getBytes(), // null?
-            moddatestr.getBytes(),
-            loaddatestr.getBytes(),
-            referrerHash.getBytes(),
-            kelondroBase64Order.enhancedCoder.encodeLong(copyCount, 
urlCopyCountLength).getBytes(),
-            flags.getBytes(),
-            kelondroBase64Order.enhancedCoder.encodeLong(quality, 
urlQualityLength).getBytes(),
-            language.getBytes(),
-            new byte[] {(byte) doctype},
-            kelondroBase64Order.enhancedCoder.encodeLong(size, 
urlSizeLength).getBytes(),
-            kelondroBase64Order.enhancedCoder.encodeLong(wordCount, 
urlWordCountLength).getBytes(),
-        };
-        urlHashCache.put(entry);
-        } catch (Exception e) {
-        serverLog.logSevere("PLASMA", "INTERNAL ERROR AT 
plasmaCrawlLURL:store:" + e.toString(), e);
+            // stores the values from the object variables into the database
+            final String moddatestr = 
kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, 
urlDateLength);
+            final String loaddatestr = 
kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, 
urlDateLength);
+
+            // store the hash in the hash cache
+            try {
+                // even if the entry exists, we simply overwrite it
+                final byte[][] entry = new byte[][] {
+                    urlHash.getBytes(),
+                    url.toString().getBytes(),
+                    descr.getBytes(), // null?
+                    moddatestr.getBytes(),
+                    loaddatestr.getBytes(),
+                    referrerHash.getBytes(),
+                    kelondroBase64Order.enhancedCoder.encodeLong(copyCount, 
urlCopyCountLength).getBytes(),
+                    flags.getBytes(),
+                    kelondroBase64Order.enhancedCoder.encodeLong(quality, 
urlQualityLength).getBytes(),
+                    language.getBytes(),
+                    new byte[] {(byte) doctype},
+                    kelondroBase64Order.enhancedCoder.encodeLong(size, 
urlSizeLength).getBytes(),
+                    kelondroBase64Order.enhancedCoder.encodeLong(wordCount, 
urlWordCountLength).getBytes(),
+                };
+                urlHashCache.put(entry);
+            } catch (Exception e) {
+                serverLog.logSevere("PLASMA", "INTERNAL ERROR AT 
plasmaCrawlLURL:store:" + e.toString(), e);
+            }
         }
-    }
 
-    public String hash() {
-        // return a url-hash, based on the md5 algorithm
-        // the result is a String of 12 bytes within a 72-bit space
-        // (each byte has an 6-bit range)
-        // that should be enough for all web pages on the world
-        return this.urlHash;
-    }
+        public String hash() {
+            // return a url-hash, based on the md5 algorithm
+            // the result is a String of 12 bytes within a 72-bit space
+            // (each byte has an 6-bit range)
+            // that should be enough for all web pages on the world
+            return this.urlHash;
+        }
 
-    public URL url() {
-        return url;
-    }
+        public URL url() {
+            return url;
+        }
 
-    public String descr() {
-        return descr;
-    }
+        public String descr() {
+            return descr;
+        }
 
-    public Date moddate() {
-        return moddate;
-    }
+        public Date moddate() {
+            return moddate;
+        }
 
-    public Date loaddate() {
-        return loaddate;
-    }
+        public Date loaddate() {
+            return loaddate;
+        }
 
-    public String referrerHash() {
-        // return the creator's hash
-        return referrerHash;
-    }
+        public String referrerHash() {
+            // return the creator's hash
+            return referrerHash;
+        }
 
-    public char doctype() {
-        return doctype;
-    }
+        public char doctype() {
+            return doctype;
+        }
 
-    public int copyCount() {
-        // return number of copies of this object in the global index
-        return copyCount;
-    }
+        public int copyCount() {
+            // return number of copies of this object in the global index
+            return copyCount;
+        }
 
-    public boolean local() {
-        // returns true if the url was created locally and is needed for own 
word index
+        public boolean local() {
+            // returns true if the url was created locally and is needed for 
own word index
             if (flags == null) return false;
             return flags.charAt(0) == 'L';
-    }
+        }
 
-    public int quality() {
-        return quality;
-    }
+        public int quality() {
+            return quality;
+        }
 
-    public String language() {
-        return language;
-    }
+        public String language() {
+            return language;
+        }
 
-    public int size() {
-        return size;
-    }
+        public int size() {
+            return size;
+        }
 
-    public int wordCount() {
-        return wordCount;
-    }
+        public int wordCount() {
+            return wordCount;
+        }
 
-    public String snippet() {
-        // the snippet may appear here if the url was transported in a remote 
search
-        // it will not be saved anywhere, but can only be requested here
-        return snippet;
-    }
+        public String snippet() {
+            // the snippet may appear here if the url was transported in a 
remote search
+            // it will not be saved anywhere, but can only be requested here
+            return snippet;
+        }
 
-    public plasmaWordIndexEntry word() {
-        return word;
-    }
+        public plasmaWordIndexEntry word() {
+            return word;
+        }
     
-    private StringBuffer corePropList() {
-        // generate a parseable string; this is a simple property-list
-        final StringBuffer corePropStr = new StringBuffer(300);
-        try {
-            corePropStr
-            .append("hash=")     .append(urlHash)
-            .append(",referrer=").append(referrerHash)
-            .append(",mod=")     .append(shortDayFormatter.format(moddate))
-            .append(",load=")    .append(shortDayFormatter.format(loaddate))
-            .append(",size=")    .append(size)
-            .append(",wc=")      .append(wordCount)
-            .append(",cc=")      .append(copyCount)
-            .append(",local=")   .append(((local()) ? "true" : "false"))
-            .append(",q=")       
.append(kelondroBase64Order.enhancedCoder.encodeLong(quality, urlQualityLength))
-            .append(",dt=")      .append(doctype)
-            .append(",lang=")    .append(language)
-            .append(",url=")     .append(crypt.simpleEncode(url.toString()))
-            .append(",descr=")   .append(crypt.simpleEncode(descr));
-            
-            if (this.word != null) {
-                // append also word properties
-                
corePropStr.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toExternalForm()));
+        public boolean isOlder (Entry other) {
+            if (other == null) return false;
+            if (moddate.before(other.moddate())) return true;
+            if (moddate.equals(other.moddate())) {
+                if (loaddate.before(other.loaddate())) return true;
+                if (loaddate.equals(other.loaddate())) {
+                    if (quality < other.quality()) return true;
+                }
             }
-            return corePropStr;
+            return false;
+        }
 
-        } catch (Exception e) {
+        private StringBuffer corePropList() {
+            // generate a parseable string; this is a simple property-list
+            final StringBuffer corePropStr = new StringBuffer(300);
+            try {
+                corePropStr
+                .append("hash=")     .append(urlHash)
+                .append(",referrer=").append(referrerHash)
+                .append(",mod=")     .append(shortDayFormatter.format(moddate))
+                .append(",load=")    
.append(shortDayFormatter.format(loaddate))
+                .append(",size=")    .append(size)
+                .append(",wc=")      .append(wordCount)
+                .append(",cc=")      .append(copyCount)
+                .append(",local=")   .append(((local()) ? "true" : "false"))
+                .append(",q=")       
.append(kelondroBase64Order.enhancedCoder.encodeLong(quality, urlQualityLength))
+                .append(",dt=")      .append(doctype)
+                .append(",lang=")    .append(language)
+                .append(",url=")     
.append(crypt.simpleEncode(url.toString()))
+                .append(",descr=")   .append(crypt.simpleEncode(descr));
+
+                if (this.word != null) {
+                    // append also word properties
+                    
corePropStr.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toExternalForm()));
+                }
+                return corePropStr;
+
+            } catch (Exception e) {
 //          serverLog.logFailure("plasmaLURL.corePropList", e.getMessage());
 //          if (moddate == null) 
serverLog.logFailure("plasmaLURL.corePropList", "moddate=null");
 //          if (loaddate == null) 
serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null");
 //          e.printStackTrace();
-            return null;
+                return null;
+            }
         }
-    }
 
     /*
     public String toString(int posintext, int posinphrase, int posofphrase) {
@@ -647,48 +690,48 @@
     }        
     */
     
-    public String toString(String snippet) {
-        // add information needed for remote transport
-        final StringBuffer core = corePropList();
-        if (core == null) return null;
+        public String toString(String snippet) {
+            // add information needed for remote transport
+            final StringBuffer core = corePropList();
+            if (core == null) return null;
 
-        core.ensureCapacity(core.length() + snippet.length()*2);
-        core.insert(0,"{");
-        core.append(",snippet=").append(crypt.simpleEncode(snippet));
-        core.append("}");
+            core.ensureCapacity(core.length() + snippet.length()*2);
+            core.insert(0,"{");
+            core.append(",snippet=").append(crypt.simpleEncode(snippet));
+            core.append("}");
 
-        return core.toString();        
-        //return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}";
-    }
+            return core.toString();        
+            //return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + 
"}";
+        }
 
-    /**
-     * Returns this object as String.<br> 
-     * This e.g. looks like this:
-     * 
<pre>{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}</pre>
-     */
-    public String toString() {
-        final StringBuffer core = corePropList();
-        if (core == null) return null;
+        /**
+         * Returns this object as String.<br> 
+         * This e.g. looks like this:
+         * 
<pre>{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}</pre>
+         */
+        public String toString() {
+            final StringBuffer core = corePropList();
+            if (core == null) return null;
 
-        core.insert(0,"{");
-        core.append("}");
+            core.insert(0,"{");
+            core.append("}");
 
-        return core.toString();
-        //return "{" + core + "}";
-    }
+            return core.toString();
+            //return "{" + core + "}";
+        }
 
-    public void print() {
-        System.out.println("URL           : " + url);
-        System.out.println("Description   : " + descr);
-        System.out.println("Modified      : " + httpc.dateString(moddate));
-        System.out.println("Loaded        : " + httpc.dateString(loaddate));
-        System.out.println("Size          : " + size + " bytes, " + wordCount 
+ " words");
-        System.out.println("Referrer Hash : " + referrerHash);
-        System.out.println("Quality       : " + quality);
-        System.out.println("Language      : " + language);
-        System.out.println("DocType       : " + doctype);
-        System.out.println();
-    }
+        public void print() {
+            System.out.println("URL           : " + url);
+            System.out.println("Description   : " + descr);
+            System.out.println("Modified      : " + httpc.dateString(moddate));
+            System.out.println("Loaded        : " + 
httpc.dateString(loaddate));
+            System.out.println("Size          : " + size + " bytes, " + 
wordCount + " words");
+            System.out.println("Referrer Hash : " + referrerHash);
+            System.out.println("Quality       : " + quality);
+            System.out.println("Language      : " + language);
+            System.out.println("DocType       : " + doctype);
+            System.out.println();
+        }
     } // class Entry
 
     public class kiter implements Iterator {

Modified: trunk/source/de/anomic/plasma/plasmaWordIndexEntity.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaWordIndexEntity.java    2006-03-10 13:57:30 UTC (rev 1873)
+++ trunk/source/de/anomic/plasma/plasmaWordIndexEntity.java    2006-03-10 16:28:01 UTC (rev 1874)
@@ -149,6 +149,10 @@
     
     public boolean addEntry(plasmaWordIndexEntry entry) throws IOException {
         if (entry == null) return false;
+        plasmaWordIndexEntry oldEntry = getEntry(entry.getUrlHash());
+        if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this entity
+            return false;
+        }
         return (theIndex.put(entry.getUrlHash().getBytes(), entry.toEncodedForm().getBytes()) == null);
     }
     
@@ -426,4 +430,4 @@
         return conj;
     }
 */
-}
\ No newline at end of file
+}

Modified: trunk/source/de/anomic/plasma/plasmaWordIndexEntry.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaWordIndexEntry.java     2006-03-10 13:57:30 UTC (rev 1873)
+++ trunk/source/de/anomic/plasma/plasmaWordIndexEntry.java     2006-03-10 16:28:01 UTC (rev 1874)
@@ -413,7 +413,25 @@
     public String getLanguage() { return new String(language); }
     public char getType() { return doctype; }
     public boolean isLocal() { return localflag == LT_LOCAL; }
-    
+
+    public boolean isNewer(plasmaWordIndexEntry other) {
+        if (other == null) return true;
+        if (this.lastModified > other.lastModified) return true;
+        if (this.lastModified == other.getLastModified()) {
+            if (this.quality > other.quality) return true;
+        }
+        return false;
+    }
+ 
+    public boolean isOlder(plasmaWordIndexEntry other) {
+        if (other == null) return false;
+        if (this.lastModified < other.getLastModified()) return true;
+        if (this.lastModified == other.getLastModified()) {
+            if (this.quality < other.quality) return true;
+        }
+        return false;
+    }
+
     public int domlengthNormalized() {
         return 255 * plasmaURL.domLengthEstimation(this.urlHash) / 30;
     }
@@ -424,4 +442,4 @@
         System.out.println("WORDHASH: " + word2hash(args[0]));
     }
    
-}
\ No newline at end of file
+}

Modified: trunk/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java    2006-03-10 13:57:30 UTC (rev 1873)
+++ trunk/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java    2006-03-10 16:28:01 UTC (rev 1874)
@@ -132,6 +132,10 @@
 
     private boolean addi(plasmaWordIndexEntry entry) {
         // returns true if the new entry was added, false if it already existet
+        plasmaWordIndexEntry oldEntry = (plasmaWordIndexEntry) container.get(entry.getUrlHash());
+        if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this container
+            return false;
+        }
         return (container.put(entry.getUrlHash(), entry) == null);
     }
 

_______________________________________________
YaCy-svn mailing list
[email protected]
http://lists.berlios.de/mailman/listinfo/yacy-svn

Antwort per Email an