Author: j16sdiz
Date: 2008-12-10 09:08:25 +0000 (Wed, 10 Dec 2008)
New Revision: 24174
Added:
   trunk/plugins/XMLSpider/Version.java
Modified:
   trunk/plugins/XMLSpider/XMLSpider.java
Log:
initial db4o support

Added: trunk/plugins/XMLSpider/Version.java
===================================================================
--- trunk/plugins/XMLSpider/Version.java	(rev 0)
+++ trunk/plugins/XMLSpider/Version.java	2008-12-10 09:08:25 UTC (rev 24174)
@@ -0,0 +1,15 @@
+/* This code is part of Freenet. It is distributed under the GNU General
+ * Public License, version 2 (or at your option any later version). See
+ * http://www.gnu.org/ for further details of the GPL. */
+package plugins.XMLSpider;
+
+/**
+ * Necessary to be able to use pluginmanager's versions
+ */
+public class Version {
+    private static final String svnRevision = "@custom@";
+
+    static String getSvnRevision() {
+        return svnRevision;
+    }
+}
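The XMLSpider.java diff below leans on db4o's SODA query API throughout: plain objects are persisted with store(), then retrieved by constraining a Query on a class and its fields. For readers who have not used db4o, here is a minimal, self-contained sketch of that idiom (db4o 7.x assumed on the classpath; the DemoPage/DemoStatus/demo.db4o names are illustrative only and not part of this commit):

import com.db4o.Db4o;
import com.db4o.ObjectContainer;
import com.db4o.ObjectSet;
import com.db4o.query.Query;

public class Db4oSodaDemo {
    static enum DemoStatus { QUEUED, DONE }

    static class DemoPage {
        long id;
        String uri;
        DemoStatus status = DemoStatus.QUEUED;
        long lastChange = System.currentTimeMillis();
    }

    public static void main(String[] args) {
        // Open (or create) a single-file object database; no schema needed.
        ObjectContainer db = Db4o.openFile("demo.db4o");
        try {
            // store() persists a plain object graph.
            DemoPage p = new DemoPage();
            p.id = 1;
            p.uri = "USK@example/index/0";
            db.store(p);

            // SODA query: constrain by class, then by a field value,
            // ordered on another field.
            Query q = db.query();
            q.constrain(DemoPage.class);
            q.descend("status").constrain(DemoStatus.QUEUED);
            q.descend("lastChange").orderAscending();
            ObjectSet<DemoPage> result = q.execute();

            // ObjectSet supports iterator-style traversal.
            while (result.hasNext())
                System.out.println(result.next().uri);
        } finally {
            db.close();
        }
    }
}

The same constrain/descend/orderAscending shape recurs in startSomeRequests(), appendList() and the snapshot code in the patch below, always against the Page class and its indexed status and lastChange fields.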
Modified: trunk/plugins/XMLSpider/XMLSpider.java
===================================================================
--- trunk/plugins/XMLSpider/XMLSpider.java	2008-12-10 07:43:04 UTC (rev 24173)
+++ trunk/plugins/XMLSpider/XMLSpider.java	2008-12-10 09:08:25 UTC (rev 24174)
@@ -18,11 +18,12 @@
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
-import java.util.LinkedList;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.Vector;
+import java.util.concurrent.atomic.AtomicLong;
 
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
@@ -37,6 +38,14 @@
 import org.w3c.dom.Element;
 import org.w3c.dom.Text;
 
+import com.db4o.Db4o;
+import com.db4o.ObjectContainer;
+import com.db4o.ObjectSet;
+import com.db4o.config.Configuration;
+import com.db4o.config.QueryEvaluationMode;
+import com.db4o.diagnostic.DiagnosticToConsole;
+import com.db4o.query.Query;
+
 import freenet.client.ClientMetadata;
 import freenet.client.FetchContext;
 import freenet.client.FetchException;
@@ -58,6 +67,7 @@
 import freenet.pluginmanager.FredPluginHTTP;
 import freenet.pluginmanager.FredPluginHTTPAdvanced;
 import freenet.pluginmanager.FredPluginThreadless;
+import freenet.pluginmanager.FredPluginVersioned;
 import freenet.pluginmanager.PluginHTTPException;
 import freenet.pluginmanager.PluginRespirator;
 import freenet.support.Logger;
@@ -74,59 +84,67 @@
  * @author swati goyal
  * 
 */
-public class XMLSpider implements FredPlugin, FredPluginHTTP, FredPluginThreadless, FredPluginHTTPAdvanced, USKCallback {
+public class XMLSpider implements FredPlugin, FredPluginHTTP, FredPluginThreadless, FredPluginVersioned,
+        FredPluginHTTPAdvanced, USKCallback {
+    static enum Status {
+        /** For simplicity, running is also mark as QUEUED */
+        QUEUED, SUCCESSED, FAILED
+    };
+
+    static class Page {
+        /** Page Id */
+        long id;
+        /** URI of the page */
+        String uri;
+        /** Title */
+        String pageTitle;
+        /** Status */
+        Status status = Status.QUEUED;
+        /** Queued Time */
+        long lastChange = System.currentTimeMillis();
+        /** Document Frequency (DF) */
+        long df;
+
+        @Override
+        public int hashCode() {
+            return (int) (id ^ (id >>> 32));
+        }
+
+        @Override
+        public boolean equals(Object obj) {
+            if (this == obj)
+                return true;
+            if (obj == null)
+                return false;
+            if (getClass() != obj.getClass())
+                return false;
+
+            return id == ((Page) obj).id;
+        }
+
+        @Override
+        public String toString() {
+            return "[PAGE: id=" + id + ", title=" + pageTitle + ", uri=" + uri + ", status=" + status + ", df=" + df
+                    + "]";
+        }
+    }
+
+    /** Document ID of fetching documents */
+    protected Map<Page, ClientGetter> runningFetch = new HashMap<Page, ClientGetter>();
+
     long tProducedIndex;
 
     /**
     * Stores the found words along with md5
     */
    public TreeMap<String, String> tMap = new TreeMap<String, String>();
-    long count;
 
-    // URIs visited, or fetching, or queued. Added once then forgotten about.
-    /**
-     * 
-     * Lists the uris that have been vistied by the spider
-     */
-    public final HashSet<FreenetURI> visitedURIs = new HashSet<FreenetURI>();
+    protected AtomicLong maxId;
 
     private final HashSet<Long> idsWithWords = new HashSet<Long>();
-    /**
-     * Lists the uris that were visited but failed.
-     */
-    public final HashSet<FreenetURI> failedURIs = new HashSet<FreenetURI>();
-
-    private final HashSet<FreenetURI> queuedURISet = new HashSet<FreenetURI>();
-    /**
-     * 
-     * Lists the uris that are still queued.
-     * 
-     * Since we have limited RAM, and we don't want stuff to be on the cooldown queue for a
-     * long period, we use 2 retries (to stay off the cooldown queue), and we go over the queued
-     * list 3 times for each key.
-     */
-    public final LinkedList<FreenetURI>[] queuedURIList = new LinkedList[] { new LinkedList<FreenetURI>(),
-            new LinkedList<FreenetURI>(), new LinkedList<FreenetURI>() };
-    private final HashMap<FreenetURI, ClientGetter> runningFetchesByURI = new HashMap<FreenetURI, ClientGetter>();
-
+
     private final HashMap<String, Long[]> idsByWord = new HashMap<String, Long[]>();
-
-    private final HashMap<Long, String> titlesOfIds = new HashMap<Long, String>();
-    private final HashMap<FreenetURI, Long> uriIds = new HashMap<FreenetURI, Long>();
-    private final HashMap<Long, FreenetURI> idUris = new HashMap<Long, FreenetURI>();
 
-    // Re-enable outlinks/inlinks when we publish them or use them for ranking.
-    /**
-     * Lists the outlinks from a particular page,
-     * </br> indexed by the id of page uri
-     */
-//    public final HashMap outlinks = new HashMap();
-    /**
-     * Lists the inlinks to a particular page,
-     * indexed by the id of page uri.
-     */
-//    public final HashMap inlinks = new HashMap();
+
     private Vector<String> indices;
     private int match;
-    private Long id;
     private long time_taken;
     /*
     * minTimeBetweenEachIndexRewriting in seconds
     */
@@ -143,8 +161,13 @@
     public Set<String> allowedMIMETypes;
     private static final int MAX_ENTRIES = 2000;
     private static final long MAX_SUBINDEX_UNCOMPRESSED_SIZE = 4*1024*1024;
-    private static int version = 32;
-    private static final String pluginName = "XML spider "+version;
+    private static int version = 33;
+    private static final String pluginName = "XML spider " + version;
+
+    public String getVersion() {
+        return version + " r" + Version.getSvnRevision();
+    }
+
     /**
     * Gives the allowed fraction of total time spent on generating indices with
     * maximum value = 1; minimum value = 0.
@@ -186,81 +209,86 @@
             catch(Exception e){}
         }
 
-        if ((!visitedURIs.contains(uri)) && queuedURISet.add(uri)) {
-            queuedURIList[0].addLast(uri);
-            visitedURIs.add(uri);
-            uriIds.put(uri, id);
-            idUris.put(id, uri);
-            id++;
+        if (getPageByURI(uri) == null) {
+            Page page = new Page();
+            page.uri = uri.toString();
+            page.id = maxId.incrementAndGet();
+
+            db.store(page);
         }
     }
 
     private void startSomeRequests() {
-
         FreenetURI[] initialURIs = core.getBookmarkURIs();
         for (int i = 0; i < initialURIs.length; i++)
-        {
             queueURI(initialURIs[i]);
-        }
 
         ArrayList<ClientGetter> toStart = null;
         synchronized (this) {
             if (stopped) {
                 return;
             }
-            int running = runningFetchesByURI.size();
-            int queued = queuedURIList[0].size() + queuedURIList[1].size() + queuedURIList[2].size();
+            int running = runningFetch.size();
+
+            Query query = db.query();
+            query.constrain(Page.class);
+            query.descend("status").constrain(Status.QUEUED);
+            query.descend("lastChange").orderAscending();
+            ObjectSet<Page> queuedSet = query.execute();
 
-            if ((running >= maxParallelRequests) || (queued == 0))
+            if ((running >= maxParallelRequests) || (queuedSet.size() - running <= 0))
                 return;
 
-            toStart = new ArrayList<ClientGetter>(Math.min(maxParallelRequests - running, queued));
+            toStart = new ArrayList<ClientGetter>(maxParallelRequests - running);
 
             for (int i = running; i < maxParallelRequests; i++) {
-                boolean found = false;
-                for(int j=0;j<queuedURIList.length;j++) {
-                    if(queuedURIList[j].isEmpty()) continue;
-                    FreenetURI uri = (FreenetURI) queuedURIList[j].removeFirst();
-                    if(j == queuedURIList.length) queuedURISet.remove(uri);
-                    ClientGetter getter = makeGetter(uri, j);
+                if (!queuedSet.hasNext())
+                    break;
+
+                Page page = queuedSet.next();
+                if (runningFetch.containsKey(page))
+                    continue;
+
+                try {
+                    ClientGetter getter = makeGetter(page, 0);
+
+                    Logger.minor(this, "Starting " + getter + " " + page);
                     toStart.add(getter);
-                    found = true;
-                    break;
+                    runningFetch.put(page, getter);
+                } catch (MalformedURLException e) {
+                    Logger.error(this, "IMPOSSIBLE-Malformed URI: " + page, e);
+
+                    page.status = Status.FAILED;
+                    page.lastChange = System.currentTimeMillis();
+                    db.store(page);
                 }
-                if(!found) break;
             }
         }
-        for (int i = 0; i < toStart.size(); i++) {
-
-            ClientGetter g = toStart.get(i);
+
+        for (ClientGetter g : toStart) {
             try {
-                runningFetchesByURI.put(g.getURI(), g);
                 g.start();
+                Logger.minor(this, g + " started");
             } catch (FetchException e) {
-                onFailure(e, g, ((MyClientCallback)g.getClientCallback()).tries);
+                Logger.error(this, "Fetch Exception: " + g, e);
+                onFailure(e, g, ((MyClientCallback) g.getClientCallback()).page, ((MyClientCallback) g
+                        .getClientCallback()).tries);
             }
         }
     }
 
-    private final ClientCallback[] clientCallbacks =
-        new ClientCallback[] {
-            new MyClientCallback(0),
-            new MyClientCallback(1),
-            new MyClientCallback(2)
-    };
     private class MyClientCallback implements ClientCallback {
-
+        final Page page;
         final int tries;
 
-        public MyClientCallback(int x) {
-            tries = x;
-            // TODO Auto-generated constructor stub
+        public MyClientCallback(Page page, int tries) {
+            this.page = page;
+            this.tries = tries;
         }
 
         public void onFailure(FetchException e, ClientGetter state) {
-            XMLSpider.this.onFailure(e, state, tries);
+            XMLSpider.this.onFailure(e, state, page, tries);
         }
 
         public void onFailure(InsertException e, BaseClientPutter state) {
@@ -280,51 +308,52 @@
         }
 
         public void onSuccess(FetchResult result, ClientGetter state) {
-            XMLSpider.this.onSuccess(result, state);
+            XMLSpider.this.onSuccess(result, state, page);
         }
 
         public void onSuccess(BaseClientPutter state) {
             // Ignore
         }
+
+        public String toString() {
+            return super.toString() + ":" + "tries=" + tries + ",page=" + page;
+        }
     }
 
-    private ClientGetter makeGetter(FreenetURI uri, int retries) {
-        ClientGetter g = new ClientGetter(clientCallbacks[retries], core.requestStarters.chkFetchScheduler, core.requestStarters.sskFetchScheduler, uri, ctx, PRIORITY_CLASS, this, null, null);
-        return g;
+    private ClientGetter makeGetter(Page page, int tries) throws MalformedURLException {
+        ClientGetter getter = new ClientGetter(new MyClientCallback(page, tries),
+                core.requestStarters.chkFetchScheduler, core.requestStarters.sskFetchScheduler,
+                new FreenetURI(page.uri), ctx, PRIORITY_CLASS, this, null, null);
+        return getter;
     }
+
     /**
     * Processes the successfully fetched uri for further outlinks.
     * 
     * @param result
     * @param state
+    * @param page
     */
-    public void onSuccess(FetchResult result, ClientGetter state) {
+    public void onSuccess(FetchResult result, ClientGetter state, Page page) {
         FreenetURI uri = state.getURI();
+        page.status = Status.SUCCESSED; // Content filter may throw, but we mark it as success anyway
 
         try {
-
             ClientMetadata cm = result.getMetadata();
             Bucket data = result.asBucket();
-            String mimeType = cm.getMIMEType();
+            String mimeType = cm.getMIMEType();
 
-            Long id;
-            synchronized(this) {
-                id = uriIds.get(uri);
-//                inlinks.put(page.id, new Vector());
-//                outlinks.put(page.id, new Vector());
-            }
             /*
             * instead of passing the current object, the pagecallback object for every page is passed to the content filter
             * this has many benefits to efficiency, and allows us to identify trivially which page is being indexed.
            * (we CANNOT rely on the base href provided).
             */
-            PageCallBack page = new PageCallBack(id);
-            Logger.minor(this, "Successful: "+uri+" : "+page.id);
+            PageCallBack pageCallBack = new PageCallBack(page);
+            Logger.minor(this, "Successful: " + uri + " : " + page.id);
 
             try {
-                Logger.minor(this, "Filtering "+uri+" : "+page.id);
-                ContentFilter.filter(data, new NullBucketFactory(), mimeType, uri.toURI("http://127.0.0.1:8888/"), page);
+                ContentFilter.filter(data, new NullBucketFactory(), mimeType, uri.toURI("http://127.0.0.1:8888/"), pageCallBack);
+                Logger.minor(this, "Filtered " + uri + " : " + page.id);
             } catch (UnsafeContentTypeException e) {
                 return; // Ignore
             } catch (IOException e) {
@@ -336,27 +365,51 @@
             }
         } finally {
             synchronized (this) {
-                runningFetchesByURI.remove(uri);
+                runningFetch.remove(page.id);
+                page.lastChange = System.currentTimeMillis();
+                db.store(page);
             }
             startSomeRequests();
         }
     }
 
-    public void onFailure(FetchException e, ClientGetter state, int tries) {
-        FreenetURI uri = state.getURI();
-        Logger.minor(this, "Failed: "+uri+" : "+e);
+    public void onFailure(FetchException fe, ClientGetter state, Page page, int tries) {
+        Logger.minor(this, "Failed: [" + tries + "] " + page + " : " + fe, fe);
 
         synchronized (this) {
-            runningFetchesByURI.remove(uri);
-            failedURIs.add(uri);
-            tries++;
-            if(tries < queuedURIList.length && !e.isFatal())
-                queuedURIList[tries].addLast(uri);
+            if (fe.newURI != null) {
+                // redirect, mark as successed
+                queueURI(fe.newURI);
+
+                runningFetch.remove(page);
+                page.status = Status.SUCCESSED;
+                page.lastChange = System.currentTimeMillis();
+                db.store(page);
+            } else if (fe.isFatal() || tries > 3) {
+                // too many tries or fatal, mark as failed
+                runningFetch.remove(page.id);
+                page.status = Status.FAILED;
+                page.lastChange = System.currentTimeMillis();
+                db.store(page);
+            } else if (!stopped) {
+                // Retry
+                // FIXME why? the original version say this keep off the cooldown queue
+                ClientGetter getter = null;
+                try {
+                    getter = makeGetter(page, tries + 1);
+                    getter.start();
+                    runningFetch.put(page, getter);
+                } catch (MalformedURLException e) {
+                    Logger.error(this, "IMPOSSIBLE-Malformed URI: " + page, e);
+                } catch (FetchException e) {
+                    onFailure(e, getter, ((MyClientCallback) getter.getClientCallback()).page,
+                            ((MyClientCallback) getter.getClientCallback()).tries);
+                }
+            }
         }
-        if (e.newURI != null)
-            queueURI(e.newURI);
-        startSomeRequests();
+        if (!stopped)
+            startSomeRequests();
     }
 
     /**
@@ -540,7 +593,7 @@
         Logger.minor(this, "Generating subindex for "+list.size()+" entries with prefix length "+p);
 
         try {
-            if(list.size() > 0 && list.size() < MAX_ENTRIES)
+            if (list.size() < MAX_ENTRIES)
             {
                 generateXML(list,p);
                 return;
@@ -641,6 +694,9 @@
                 Logger.error(this, "Eh?");
                 continue;
             }
+
+            Page page = getPageById(id);
+
             /*
             * adding file information
             * uriElement - lists the id of the file containing a particular word
@@ -650,11 +706,8 @@
             Element fileElement = xmlDoc.createElement("file");
             uriElement.setAttribute("id", x.toString());
             fileElement.setAttribute("id", x.toString());
-            fileElement.setAttribute("key",(idUris.get(id)).toString());
-            if(titlesOfIds.containsKey(id))
-                fileElement.setAttribute("title",(titlesOfIds.get(id)).toString());
-            else
-                fileElement.setAttribute("title",(idUris.get(id)).toString());
+            fileElement.setAttribute("key", page.uri);
+            fileElement.setAttribute("title", page.pageTitle != null ? page.pageTitle : page.uri);
 
             /* Position by position */
@@ -714,13 +767,6 @@
         Logger.minor(this, "Spider: indexes regenerated.");
     }
 
-    /**
-     * @see freenet.oldplugins.plugin.Plugin#getPluginName()
-     */
-    public String getPluginName() {
-        return pluginName;
-    }
-
     private static String convertToHex(byte[] data) {
         StringBuilder buf = new StringBuilder();
         for (int i = 0; i < data.length; i++) {
@@ -870,14 +916,14 @@
     public void terminate(){
         synchronized (this) {
             stopped = true;
-            for(int i=0;i<queuedURIList.length;i++)
-                queuedURIList[i].clear();
+            for (Map.Entry<Page, ClientGetter> me : runningFetch.entrySet()) {
+                me.getValue().cancel();
+            }
         }
     }
 
     public void runPlugin(PluginRespirator pr){
         this.pr = pr;
-        this.id = 0L;
         this.core = pr.getNode().clientCore;
 
         /* Initialize Fetch Context */
@@ -894,12 +940,26 @@
         tProducedIndex = System.currentTimeMillis();
         stopped = false;
-        count = 0;
         if (!new File(DEFAULT_INDEX_DIR).mkdirs()) {
             Logger.error(this, "Could not create default index directory ");
         }
-        //startPlugin();
+
+        // Initial DB4O
+        db = initDB4O();
+
+        // Find max Page ID
+        {
+            Query query = db.query();
+            query.constrain(Page.class);
+            query.descend("id").orderDescending();
+            ObjectSet<Page> set = query.execute();
+            if (set.hasNext())
+                maxId = new AtomicLong(set.next().id);
+            else
+                maxId = new AtomicLong(0);
+        }
+
         pr.getNode().executor.execute(new Runnable() {
             public void run() {
                 try{
@@ -922,7 +982,7 @@
             {
                 appendDefaultHeader(out,null);
                 out.append("<p><h4>"+listname+" URIs</h4></p>");
-                appendList(listname,out,null);
+                appendList(listname, out);
                 return out.toString();
             }
             appendDefaultPageStart(out,null);
@@ -932,8 +992,18 @@
             try {
                 FreenetURI uri = new FreenetURI(uriParam);
                 synchronized (this) {
-                    failedURIs.remove(uri);
-                    visitedURIs.remove(uri);
+                    // Check if queued already
+                    Page page = getPageByURI(uri);
+                    if (page != null) {
+                        // We have no reliable way to stop a request,
+                        // requeue only if it is successed / failed
+                        if (page.status == Status.SUCCESSED || page.status == Status.FAILED) {
+                            page.lastChange = System.currentTimeMillis();
+                            page.status = Status.QUEUED;
+
+                            db.store(page);
+                        }
+                    }
                 }
                 out.append("<p>URI added :"+uriParam+"</p>");
                 queueURI(uri);
@@ -947,20 +1017,39 @@
     /*
     * List the visited, queued, failed and running fetches on the web interface
     */
-    private synchronized void appendList(String listname, StringBuilder out, String stylesheet)
-    {
-        Iterator<FreenetURI> it = (runningFetchesByURI.keySet()).iterator();
-        if(listname.equals("running"))
-            it = (runningFetchesByURI.keySet()).iterator();
-        if(listname.equals("visited"))
-            it = (new HashSet<FreenetURI>(visitedURIs)).iterator();
-        if(listname.startsWith("queued"))
-            it = (new ArrayList<FreenetURI>(queuedURIList[Integer.parseInt(listname.substring("queued".length()))]))
-                    .iterator();
-        if(listname.equals("failed"))
-            it = (new HashSet<FreenetURI>(failedURIs)).iterator();
-        while(it.hasNext())
-            out.append("<code>"+it.next().toString()+"</code><br/>");
+    private synchronized void appendList(String listname, StringBuilder out) {
+        Iterable<Page> it = runningFetch.keySet();
+
+        if (listname.equals("running")) {
+            it = runningFetch.keySet();
+        } else if (listname.equals("visited")) {
+            Query query = db.query();
+            query.constrain(Page.class);
+            query.descend("status").constrain(Status.SUCCESSED);
+            query.descend("lastChange").orderAscending();
+            ObjectSet<Page> set = query.execute();
+
+            it = set;
+        } else if (listname.equals("queued")) {
+            Query query = db.query();
+            query.constrain(Page.class);
+            query.descend("status").constrain(Status.QUEUED);
+            query.descend("lastChange").orderAscending();
+            ObjectSet<Page> set = query.execute();
+
+            it = set;
+        } else if (listname.equals("failed")) {
+            Query query = db.query();
+            query.constrain(Page.class);
+            query.descend("status").constrain(Status.FAILED);
+            query.descend("lastChange").orderAscending();
+            ObjectSet<Page> set = query.execute();
+
+            it = set;
+        }
+
+        for (Page page : it)
+            out.append("<code>" + page.uri + "</code><br/>");
     }
 
     private void appendDefaultPageStart(StringBuilder out, String stylesheet) {
@@ -973,42 +1062,79 @@
         out.append("Add uri:");
         out.append("<form method=\"GET\"><input type=\"text\" name=\"adduri\" /><br/><br/>");
         out.append("<input type=\"submit\" value=\"Add uri\" /></form>");
-        Set<FreenetURI> runningFetches;
-        Set<FreenetURI> visited;
-        Set<FreenetURI> failed;
-        List<FreenetURI>[] queued = new List[queuedURIList.length];
+        List<Page> runningFetchesSnapshot;
+        long runningFetchesSnapshotSize;
+        List<Page> visitedSnapshot;
+        long visitedSnapshotSize;
+        List<Page> failedSnapshot;
+        long failedSnapshotSize;
+        List<Page> queuedSnapshot;
+        long queuedSnapshotSize;
+
         synchronized(this) {
-            visited = new HashSet<FreenetURI>(visitedURIs);
-            failed = new HashSet<FreenetURI>(failedURIs);
-            for(int i=0;i<queuedURIList.length;i++)
-                queued[i] = new ArrayList<FreenetURI>(queuedURIList[i]);
-            runningFetches = new HashSet<FreenetURI>(runningFetchesByURI.keySet());
+            runningFetchesSnapshot = new ArrayList<Page>(maxShownURIs);
+            {
+                Iterator<Page> it = this.runningFetch.keySet().iterator();
+                for (int i = 0; it.hasNext() && i < maxShownURIs; i++)
+                    runningFetchesSnapshot.add(it.next());
+                runningFetchesSnapshotSize = runningFetch.size();
+            }
+
+            visitedSnapshot = new ArrayList<Page>(maxShownURIs);
+            Query query = db.query();
+            query.constrain(Page.class);
+            query.descend("status").constrain(Status.SUCCESSED);
+            query.descend("lastChange").orderAscending();
+            ObjectSet<Page> set = query.execute();
+            for (int i = 0; set.hasNext() && i < maxShownURIs; i++)
+                visitedSnapshot.add(set.next());
+            visitedSnapshotSize = set.size();
+
+            failedSnapshot = new ArrayList<Page>(maxShownURIs);
+            query = db.query();
+            query.constrain(Page.class);
+            query.descend("status").constrain(Status.FAILED);
+            query.descend("lastChange").orderAscending();
+            set = query.execute();
+            for (int i = 0; set.hasNext() && i < maxShownURIs; i++)
+                failedSnapshot.add(set.next());
+            failedSnapshotSize = set.size();
+
+            queuedSnapshot = new ArrayList<Page>(maxShownURIs);
+            query = db.query();
+            query.constrain(Page.class);
+            query.descend("status").constrain(Status.QUEUED);
+            query.descend("lastChange").orderAscending();
+            set = query.execute();
+            for (int i = 0; set.hasNext() && i < maxShownURIs; i++)
+                queuedSnapshot.add(set.next());
+            queuedSnapshotSize = set.size();
         }
+
         out.append("<p><h3>Running Fetches</h3></p>");
-        out.append("<br/>Size :"+runningFetches.size()+"<br/>");
-        appendList(runningFetches,out,stylesheet);
+        out.append("<br/>Size :" + runningFetchesSnapshotSize + "<br/>");
+        for (Page page : runningFetchesSnapshot)
+            out.append("<code>" + page.uri + "</code><br/>");
         out.append("<p><a href=\"?list="+"running"+"\">Show all</a><br/></p>");
-        for(int j=0;j<queued.length;j++) {
-            out.append("<p><h3>Queued URIs ("+j+")</h3></p>");
-            out.append("<br/>Size :"+queued[j].size()+"<br/>");
-            int i = 0;
-            Iterator<FreenetURI> it = queued[j].iterator();
-            while(it.hasNext()){
-                if(i<=maxShownURIs){
-                    out.append("<code>"+it.next().toString()+"</code><br/>");
-                }
-                else break;
-                i++;
-            }
-            out.append("<p><a href=\"?list="+"queued"+j+"\">Show all</a><br/></p>");
-        }
+
+        out.append("<p><h3>Queued URIs</h3></p>");
+        out.append("<br/>Size :" + queuedSnapshotSize + "<br/>");
+        for (Page page : queuedSnapshot)
+            out.append("<code>" + page.uri + "</code><br/>");
+        out.append("<p><a href=\"?list=\">Show all</a><br/></p>");
+
         out.append("<p><h3>Visited URIs</h3></p>");
-        out.append("<br/>Size :"+visited.size()+"<br/>");
-        appendList(visited,out,stylesheet);
+        out.append("<br/>Size :" + visitedSnapshotSize + "<br/>");
+        for (Page page : visitedSnapshot)
+            out.append("<code>" + page.uri + "</code><br/>");
         out.append("<p><a href=\"?list="+"visited"+"\">Show all</a><br/></p>");
+
         out.append("<p><h3>Failed URIs</h3></p>");
-        out.append("<br/>Size :"+failed.size()+"<br/>");
-        appendList(failed,out,stylesheet);
+        out.append("<br/>Size :" + failedSnapshotSize + "<br/>");
+        for (Page page : failedSnapshot)
+            out.append("<code>" + page.uri + "</code><br/>");
         out.append("<p><a href=\"?list="+"failed"+"\">Show all</a><br/></p>");
         out.append("<p>Time taken in generating index = "+time_taken+"</p>");
     }
@@ -1025,21 +1151,6 @@
         out.append("<input type=\"submit\" value=\"Add uri\" /></form>");
     }
-
-    private void appendList(Set<FreenetURI> list, StringBuilder out, String stylesheet) {
-        Iterator<FreenetURI> it = list.iterator();
-        int i = 0;
-        while(it.hasNext()){
-            if(i<=maxShownURIs){
-                out.append("<code>"+it.next().toString()+"</code><br/>");
-            }
-            else{
-                break;
-            }
-            i++;
-        }
-    }
-
     /**
     * creates the callback object for each page.
     *<p>Used to create inlinks and outlinks for each page separately.
@@ -1047,12 +1158,10 @@
     *
     */
     public class PageCallBack implements FoundURICallback{
-        final Long id;
-        /*
-         * id of the page as refrenced in uriIds
-         */
-        PageCallBack(Long i) {
-            id = i;
+        final Page page;
+
+        PageCallBack(Page page) {
+            this.page = page;
         }
 
         public void foundURI(FreenetURI uri){
@@ -1061,7 +1170,7 @@
         }
 
         public void foundURI(FreenetURI uri, boolean inline){
-            Logger.minor(this, "foundURI "+uri+" on "+id);
+            Logger.minor(this, "foundURI " + uri + " on " + page);
             queueURI(uri);
             // FIXME re-enable outlinks/inlinks when we can do something useful with them
//            synchronized(XMLSpider.this) {
@@ -1105,14 +1214,14 @@
         public void onText(String s, String type, URI baseURI){
-            Logger.minor(this, "onText on "+id+" ("+baseURI+")");
+            Logger.minor(this, "onText on " + page.id + " (" + baseURI + ")");
 
             if((type != null) && (type.length() != 0) && type.toLowerCase().equals("title")
                     && (s != null) && (s.length() != 0) && (s.indexOf('\n') < 0)) {
                 /*
                 * title of the page
                 */
-                titlesOfIds.put(id, s);
+                page.pageTitle = s;
                 type = "title";
             }
             else type = null;
@@ -1122,7 +1231,7 @@
             */
             String[] words = s.split("[^\\p{L}\\{N}]");
 
             Integer lastPosition = null;
-            lastPosition = lastPositionById.get(id);
+            lastPosition = lastPositionById.get(page.id);
 
             if(lastPosition == null)
                 lastPosition = 1;
@@ -1134,16 +1243,16 @@
                 word = word.intern();
                 try{
                     if(type == null)
-                        addWord(word, lastPosition.intValue() + i, id);
+                        addWord(word, lastPosition.intValue() + i, page.id);
                     else
-                        addWord(word, -1 * (i+1), id);
+                        addWord(word, -1 * (i + 1), page.id);
                 }
                 catch (Exception e){}
             }
 
             if(type == null) {
                 lastPosition = lastPosition + words.length;
-                lastPositionById.put(id, lastPosition);
+                lastPositionById.put(page.id, lastPosition);
             }
         }
 
@@ -1238,7 +1347,6 @@
 
     private void scheduleMakeIndex() {
         core.getTicker().queueTimedJob(new PrioRunnable() {
-
             public void run() {
                 try {
                     makeIndex();
@@ -1278,4 +1386,57 @@
     public short getPollingPriorityProgress() {
         return PRIORITY_CLASS;
     }
+
+    /**
+     * Initializes DB4O.
+     *
+     * @return db4o's connector
+     */
+    protected ObjectContainer db;
+
+    private ObjectContainer initDB4O() {
+        Configuration cfg = Db4o.newConfiguration();
+
+        //- Page
+        cfg.objectClass(Page.class).objectField("id").indexed(true);
+        cfg.objectClass(Page.class).objectField("uri").indexed(true);
+        cfg.objectClass(Page.class).objectField("status").indexed(true);
+        cfg.objectClass(Page.class).objectField("lastChange").indexed(true);
+
+        cfg.objectClass(Page.class).cascadeOnActivate(true);
+        cfg.objectClass(Page.class).cascadeOnUpdate(true);
+        cfg.objectClass(Page.class).cascadeOnDelete(true);
+
+        //- Other
+        cfg.activationDepth(4);
+        cfg.updateDepth(4);
+        cfg.queries().evaluationMode(QueryEvaluationMode.LAZY);
+        cfg.diagnostic().addListener(new DiagnosticToConsole());
+
+        return Db4o.openFile(cfg, "XMLSpider-" + version + ".db4o");
+    }
+
+    protected Page getPageByURI(FreenetURI uri) {
+        Query query = db.query();
+        query.constrain(Page.class);
+        query.descend("uri").constrain(uri.toString());
+        ObjectSet<Page> set = query.execute();
+
+        if (set.hasNext())
+            return set.next();
+        else
+            return null;
+    }
+
+    protected Page getPageById(long id) {
+        Query query = db.query();
+        query.constrain(Page.class);
+        query.descend("id").constrain(id);
+        ObjectSet<Page> set = query.execute();
+
+        if (set.hasNext())
+            return set.next();
+        else
+            return null;
+    }
 }
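A practical consequence of this change is that the crawl state now survives restarts in a single db4o file ("XMLSpider-" + version + ".db4o", i.e. XMLSpider-33.db4o at this revision, per initDB4O() above), so it can be inspected offline. Here is a sketch of such a reader, assuming db4o and the plugin classes are on the classpath; InspectSpiderDb is illustrative and not part of this commit. It lives in the plugins.XMLSpider package because Page is package-private, and it should be run while the node is stopped, since db4o locks the database file:

package plugins.XMLSpider;

import com.db4o.Db4o;
import com.db4o.ObjectContainer;
import com.db4o.ObjectSet;
import com.db4o.query.Query;

public class InspectSpiderDb {
    public static void main(String[] args) {
        // Open the store the spider creates; the default configuration
        // is fine for read-only inspection.
        ObjectContainer db = Db4o.openFile("XMLSpider-33.db4o");
        try {
            // Constraining only on the class matches every stored Page.
            Query q = db.query();
            q.constrain(XMLSpider.Page.class);
            ObjectSet<XMLSpider.Page> pages = q.execute();
            while (pages.hasNext())
                System.out.println(pages.next()); // uses Page.toString() from the patch
        } finally {
            db.close();
        }
    }
}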
