Author: thorsten Date: Mon Jan 14 07:55:34 2008 New Revision: 611834 URL: http://svn.apache.org/viewvc?rev=611834&view=rev Log: Wrapping up the first working version based on Spring. The API is mainly the same as before; only a small amount of extension-point-specific code has been dropped.
Added: labs/droids/trunk/src/core/java/org/apache/droids/api/Handler.java (with props) labs/droids/trunk/src/core/java/org/apache/droids/handle/ labs/droids/trunk/src/core/java/org/apache/droids/handle/Save.java (with props) labs/droids/trunk/src/core/java/org/apache/droids/handle/Sysout.java (with props) labs/droids/trunk/src/core/java/org/apache/droids/handle/WriterHandler.java (with props) labs/droids/trunk/src/core/java/org/apache/droids/helper/factories/HandlerFactory.java (with props) Modified: labs/droids/trunk/src/core/java/org/apache/droids/Core.java labs/droids/trunk/src/core/java/org/apache/droids/DefaultCrawler.java labs/droids/trunk/src/core/java/org/apache/droids/DefaultWorker.java labs/droids/trunk/src/core/java/org/apache/droids/api/Parser.java labs/droids/trunk/src/core/java/org/apache/droids/api/Task.java labs/droids/trunk/src/core/java/org/apache/droids/api/Worker.java labs/droids/trunk/src/core/java/org/apache/droids/droids-core-context.xml labs/droids/trunk/src/core/java/org/apache/droids/helper/factories/GenericFactory.java labs/droids/trunk/src/core/java/org/apache/droids/helper/factories/URLFiltersFactory.java labs/droids/trunk/src/core/java/org/apache/droids/parse/Outlink.java labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java labs/droids/trunk/src/core/java/org/apache/droids/queue/QueueBean.java labs/droids/trunk/src/core/java/org/apache/droids/queue/QueueLink.java labs/droids/trunk/src/core/java/org/apache/droids/queue/Simple.java Modified: labs/droids/trunk/src/core/java/org/apache/droids/Core.java URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/Core.java?rev=611834&r1=611833&r2=611834&view=diff ============================================================================== --- labs/droids/trunk/src/core/java/org/apache/droids/Core.java (original) +++ labs/droids/trunk/src/core/java/org/apache/droids/Core.java Mon Jan 14 07:55:34 2008 @@ -18,6 +18,7 @@ import 
org.apache.droids.api.Droid; import org.apache.droids.helper.factories.DroidFactory; +import org.apache.droids.helper.factories.HandlerFactory; import org.apache.droids.helper.factories.ParserFactory; import org.apache.droids.helper.factories.ProtocolFactory; import org.apache.droids.helper.factories.URLFiltersFactory; @@ -39,6 +40,8 @@ private ProtocolFactory protocolFactory; private URLFiltersFactory filtersFactory; + + private HandlerFactory handlerFactory; public ProtocolFactory getProtocolFactory() { return protocolFactory; @@ -73,6 +76,14 @@ public void setFiltersFactory(URLFiltersFactory filtersFactory) { this.filtersFactory = filtersFactory; + } + + public HandlerFactory getHandlerFactory() { + return handlerFactory; + } + + public void setHandlerFactory(HandlerFactory handlerFactory) { + this.handlerFactory = handlerFactory; } Modified: labs/droids/trunk/src/core/java/org/apache/droids/DefaultCrawler.java URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/DefaultCrawler.java?rev=611834&r1=611833&r2=611834&view=diff ============================================================================== --- labs/droids/trunk/src/core/java/org/apache/droids/DefaultCrawler.java (original) +++ labs/droids/trunk/src/core/java/org/apache/droids/DefaultCrawler.java Mon Jan 14 07:55:34 2008 @@ -38,6 +38,12 @@ private ConcurrentHashMap<Integer,Worker> runningWorker; + private int x=0; + + private synchronized void increment(){ + x++; + } + public void run() { runningThreads = 0; taskDate = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date(System @@ -47,11 +53,14 @@ new LinkedBlockingQueue<Runnable>()); runningWorker=new ConcurrentHashMap<Integer,Worker>(); initQueue(); - int x = 0; + doWork(0); + Core.threadMessage("Finshed invocation, waiting for workers to finish."); + } + + private void doWork(int i) { while (queue.hasNext()) { - x = startWorkers(x); + startWorkers(x); } - Core.threadMessage("Finshed invocation, waiting for workers to 
finish."); } private synchronized int startWorkers(int x) { @@ -59,7 +68,7 @@ worker.setId(x); runningWorker.put(x,worker); pool.execute(worker); - x++; + increment(); try { Core.threadMessage("suspending"); Thread.sleep(4000); @@ -70,7 +79,7 @@ } public synchronized void initQueue() { - QueueLink initialLink = new QueueLink(url, taskDate); + QueueLink initialLink = new QueueLink(url, taskDate,0); queue.init((Task[])new Task[] {initialLink}); } public synchronized Worker getWorker() { @@ -155,12 +164,16 @@ } public synchronized void finishedWorker(int id) { - pool.remove(runningWorker.get(id)); + Worker worker = runningWorker.get(id); + int y = worker.getDepth()+1; + pool.remove(worker); runningWorker.remove(id); Core.threadMessage("Worker \""+id+"\" has finished."); if (runningWorker.size()==0 & !queue.hasNext()){ shutdownAndAwaitTermination(); Core.threadMessage("All threads has finished."); + }else if(queue.hasNext()){ + doWork(y); } } Modified: labs/droids/trunk/src/core/java/org/apache/droids/DefaultWorker.java URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/DefaultWorker.java?rev=611834&r1=611833&r2=611834&view=diff ============================================================================== --- labs/droids/trunk/src/core/java/org/apache/droids/DefaultWorker.java (original) +++ labs/droids/trunk/src/core/java/org/apache/droids/DefaultWorker.java Mon Jan 14 07:55:34 2008 @@ -1,5 +1,7 @@ package org.apache.droids; +import java.io.IOException; +import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; @@ -10,6 +12,7 @@ import org.apache.droids.api.Queue; import org.apache.droids.api.Task; import org.apache.droids.api.Worker; +import org.apache.droids.helper.factories.HandlerFactory; import org.apache.droids.helper.factories.ParserFactory; import org.apache.droids.helper.factories.ProtocolFactory; import org.apache.droids.helper.factories.URLFiltersFactory; @@ -35,6 +38,10 @@ private String uri; 
private URLFiltersFactory filtersFactory; + + private HandlerFactory handlerFactory; + + private int depth; public synchronized void run() { Core.threadMessage("Starting " + this.getClass().getCanonicalName()); @@ -47,8 +54,7 @@ parser = parserFactory.getParser(contentType); // parse contains the outlinks and can be used later Parse parse = getParse(); - // if no parser is found we do not extract links - + handle(parse); droid.finishedWorker(id); } catch (Exception e) { e.printStackTrace(); @@ -56,41 +62,51 @@ } + private void handle(Parse parse) throws MalformedURLException, IOException { + if (null != parse) + handlerFactory.handle(protocol.openStream(uri), new URL(uri), parse); + } + private Parse getParse() { Parse parse =null; if (null != parser) { try { // extract links - parse = parser.getParse(protocol.openStream(uri), new URL(uri)); + parse = parser.getParse(protocol.openStream(uri), link); // all links from the page unfiltered filter(parse); } catch (Exception e) { Core.threadMessage(e.getMessage()); } } - // TODO Auto-generated method stub - return null; + return parse; } private void filter(Parse parse) { + Outlink[] filterLinks = filterLinks(parse); + queue.merge(filterLinks); + } + + private Outlink[] filterLinks(Parse parse) { + // filter the link Outlink[] links = parse.getData().getOutlinks(); // new cleaned list ArrayList<Outlink> filtered = new ArrayList<Outlink>(); for (int i = 0; i < links.length; i++) { Outlink outlink = links[i]; - String test = filtersFactory.filter(outlink.getToUrl()); - if (null != test & !filtered.contains(outlink)) { + if (filtersFactory.accept(outlink.getToUrl()) & !filtered.contains(outlink)) { filtered.add(outlink); } } // this are the links we need to follow Outlink[] filterLinks = filtered.toArray(new Outlink[filtered.size()]); - queue.merge(filterLinks); + return filterLinks; } public void setQueue(Queue queue) { this.queue=queue; link = queue.next(); + depth=link.getDepth(); } public void setDroid(Droid droid) { 
@@ -98,9 +114,18 @@ protocolFactory = droid.getCore().getProtocolFactory(); parserFactory=droid.getCore().getParserFactory(); filtersFactory=droid.getCore().getFiltersFactory(); + handlerFactory=droid.getCore().getHandlerFactory(); } public void setId(int x) { id=x; + } + + public int getDepth() { + return depth; + } + + public void setDepth(int x) { + depth=x; } } Added: labs/droids/trunk/src/core/java/org/apache/droids/api/Handler.java URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/api/Handler.java?rev=611834&view=auto ============================================================================== --- labs/droids/trunk/src/core/java/org/apache/droids/api/Handler.java (added) +++ labs/droids/trunk/src/core/java/org/apache/droids/api/Handler.java Mon Jan 14 07:55:34 2008 @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.droids.api; + +import java.io.InputStream; +import java.net.URL; + +public interface Handler{ + public void handle(InputStream openStream, URL url, Parse parse) throws Exception; +} Propchange: labs/droids/trunk/src/core/java/org/apache/droids/api/Handler.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: labs/droids/trunk/src/core/java/org/apache/droids/api/Parser.java URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/api/Parser.java?rev=611834&r1=611833&r2=611834&view=diff ============================================================================== --- labs/droids/trunk/src/core/java/org/apache/droids/api/Parser.java (original) +++ labs/droids/trunk/src/core/java/org/apache/droids/api/Parser.java Mon Jan 14 07:55:34 2008 @@ -21,5 +21,5 @@ public interface Parser{ /** Creates the parse for some content. */ - Parse getParse(InputStream stream, URL base); + Parse getParse(InputStream openStream, Task link); } Modified: labs/droids/trunk/src/core/java/org/apache/droids/api/Task.java URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/api/Task.java?rev=611834&r1=611833&r2=611834&view=diff ============================================================================== --- labs/droids/trunk/src/core/java/org/apache/droids/api/Task.java (original) +++ labs/droids/trunk/src/core/java/org/apache/droids/api/Task.java Mon Jan 14 07:55:34 2008 @@ -19,4 +19,6 @@ public interface Task { public String getId(); public String getTaskDate(); + public int getDepth(); + public void setDepth(int depth); } Modified: labs/droids/trunk/src/core/java/org/apache/droids/api/Worker.java URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/api/Worker.java?rev=611834&r1=611833&r2=611834&view=diff ============================================================================== --- 
labs/droids/trunk/src/core/java/org/apache/droids/api/Worker.java (original) +++ labs/droids/trunk/src/core/java/org/apache/droids/api/Worker.java Mon Jan 14 07:55:34 2008 @@ -5,4 +5,6 @@ public abstract void setQueue(Queue queue); public abstract void setDroid(Droid droid); public abstract void setId(int x); + public abstract void setDepth(int x); + public abstract int getDepth(); } Modified: labs/droids/trunk/src/core/java/org/apache/droids/droids-core-context.xml URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/droids-core-context.xml?rev=611834&r1=611833&r2=611834&view=diff ============================================================================== --- labs/droids/trunk/src/core/java/org/apache/droids/droids-core-context.xml (original) +++ labs/droids/trunk/src/core/java/org/apache/droids/droids-core-context.xml Mon Jan 14 07:55:34 2008 @@ -3,6 +3,7 @@ <beans> + <!-- Core - factories register --> <bean id="org.apache.droids.Core" class="org.apache.droids.Core"> <property name="droids" ref="org.apache.droids.helper.factories.DroidFactory"/> @@ -12,8 +13,15 @@ ref="org.apache.droids.helper.factories.ParserFactory"/> <property name="filtersFactory" ref="org.apache.droids.helper.factories.URLFiltersFactory"/> + <property name="handlerFactory" + ref="org.apache.droids.helper.factories.HandlerFactory"/> </bean> + <!-- Factories --> + <!-- FIXME: this needs to be easy overridable for custom droids + e.g. split into different files and using import + Workaround: + 1) Using your own context (copy this one or better import it. 
--> <bean id="org.apache.droids.helper.factories.DroidFactory" class="org.apache.droids.helper.factories.DroidFactory"> <property name="map"> @@ -50,6 +58,16 @@ </property> </bean> + <bean id="org.apache.droids.helper.factories.HandlerFactory" + class="org.apache.droids.helper.factories.HandlerFactory"> + <property name="map"> + <map> + <entry key="save" value-ref="org.apache.droids.handle.Save"/> + <!--<entry key="sysout" value-ref="org.apache.droids.handle.Sysout"/>--> + </map> + </property> + </bean> + <!-- Droids --> <bean id="default" class="org.apache.droids.DefaultCrawler"> <property name="core" ref="org.apache.droids.Core"/> <property name="queue" ref="org.apache.droids.queue.Simple"/> @@ -57,13 +75,13 @@ <property name="url" value="http://target-x.de/about.html"/> </bean> - + <!-- Queue --> <bean id="org.apache.droids.queue.Simple" class="org.apache.droids.queue.Simple"> <property name="maxDepth" value="1"/> - <property name="maxSize" value="-1"/> + <property name="maxSize" value="5"/> </bean> - + <!-- Protocol --> <bean id="org.apache.droids.protocol.http.Http" class="org.apache.droids.protocol.http.Http"> <property name="from" value="[EMAIL PROTECTED]"/> @@ -71,14 +89,15 @@ <property name="userAgent" value="DROIDS-crawler-x-m01y08"/> <property name="timeout" value="10000"/> </bean> - + <!-- Parser --> <bean id="org.apache.droids.parse.html.HtmlParser" class="org.apache.droids.parse.html.HtmlParser"/> - + <!-- Filter --> <bean id="org.apache.droids.net.RegexURLFilter" class="org.apache.droids.net.RegexURLFilter"> - <property name="file" value="/home/thorsten/src/apache/droids/trunk/regex-urlfilter.txt"> - - </property> + <property name="file" value="/home/thorsten/src/apache/droids/trunk/regex-urlfilter.txt"/> </bean> - - <!--<bean id="org.apache.droids.Job" class="org.apache.droids.Job">--> + <!-- Handler --> + <bean id="org.apache.droids.handle.Save" class="org.apache.droids.handle.Save"> + <property name="outputDir" 
value="/home/thorsten/src/sadesi/temp/boja2/droids/"/> + </bean> + <bean id="org.apache.droids.handle.Sysout" class="org.apache.droids.handle.Sysout"/> </beans> Added: labs/droids/trunk/src/core/java/org/apache/droids/handle/Save.java URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/handle/Save.java?rev=611834&view=auto ============================================================================== --- labs/droids/trunk/src/core/java/org/apache/droids/handle/Save.java (added) +++ labs/droids/trunk/src/core/java/org/apache/droids/handle/Save.java Mon Jan 14 07:55:34 2008 @@ -0,0 +1,61 @@ +package org.apache.droids.handle; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.Reader; +import java.io.Writer; +import java.net.URL; + +import org.apache.droids.api.Handler; +import org.apache.droids.api.Parse; + +public class Save extends WriterHandler implements Handler { + + private String outputDir; + + private URL url; + + public void handle(InputStream stream, URL url, Parse parse) throws Exception { + this.url = url; + writeOutput(stream); + } + + private void writeOutput(InputStream stream) throws IOException { + if (!url.getFile().endsWith("/")) { + Reader reader = new InputStreamReader(stream); + String file = outputDir + url.getHost() + url.getFile(); + log.info("Trying to save " + url + " to " + file); + File cache = new File(file); + createFile(cache); + Writer output = new OutputStreamWriter(new FileOutputStream(cache)); + pipe(reader, output); + } + } + + private void createFile(File cache) throws IOException { + if (!cache.isDirectory() & !cache.getAbsolutePath().endsWith("/")) { + try { + cache.createNewFile(); + } catch (Exception e) { + // if we cannot create a file that means that the parent path + // does not exists + File path = new File(cache.getParent()); + 
path.mkdirs(); + cache.createNewFile(); + } + } + } + + public String getOutputDir() { + return outputDir; + } + + public void setOutputDir(String outputDir) { + this.outputDir = outputDir; + } + +} Propchange: labs/droids/trunk/src/core/java/org/apache/droids/handle/Save.java ------------------------------------------------------------------------------ svn:eol-style = native Added: labs/droids/trunk/src/core/java/org/apache/droids/handle/Sysout.java URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/handle/Sysout.java?rev=611834&view=auto ============================================================================== --- labs/droids/trunk/src/core/java/org/apache/droids/handle/Sysout.java (added) +++ labs/droids/trunk/src/core/java/org/apache/droids/handle/Sysout.java Mon Jan 14 07:55:34 2008 @@ -0,0 +1,26 @@ +package org.apache.droids.handle; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.Reader; +import java.io.Writer; +import java.net.URL; + +import org.apache.droids.api.Handler; +import org.apache.droids.api.Parse; + +public class Sysout extends WriterHandler implements Handler { + + private void writeOutput(InputStream stream) throws IOException { + Reader reader = new InputStreamReader(stream); + Writer output = new OutputStreamWriter(System.out); + pipe(reader, output); + } + + public void handle(InputStream stream, URL url, Parse parse) throws Exception { + writeOutput(stream); + } + +} Propchange: labs/droids/trunk/src/core/java/org/apache/droids/handle/Sysout.java ------------------------------------------------------------------------------ svn:eol-style = native Added: labs/droids/trunk/src/core/java/org/apache/droids/handle/WriterHandler.java URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/handle/WriterHandler.java?rev=611834&view=auto 
============================================================================== --- labs/droids/trunk/src/core/java/org/apache/droids/handle/WriterHandler.java (added) +++ labs/droids/trunk/src/core/java/org/apache/droids/handle/WriterHandler.java Mon Jan 14 07:55:34 2008 @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.droids.handle; + +import java.io.IOException; +import java.io.Reader; +import java.io.Writer; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class WriterHandler { + + protected final Log log = LogFactory.getLog(this.getClass() + .getCanonicalName()); + + /** + * Pipes everything from the reader to the writer via a buffer + */ + protected static void pipe(Reader reader, Writer writer) throws IOException { + char[] buf = new char[1024]; + int read = 0; + while ((read = reader.read(buf)) >= 0) { + writer.write(buf, 0, read); + } + writer.flush(); + } + + public WriterHandler() { + super(); + } + +} \ No newline at end of file Propchange: labs/droids/trunk/src/core/java/org/apache/droids/handle/WriterHandler.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: labs/droids/trunk/src/core/java/org/apache/droids/helper/factories/GenericFactory.java URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/helper/factories/GenericFactory.java?rev=611834&r1=611833&r2=611834&view=diff ============================================================================== --- labs/droids/trunk/src/core/java/org/apache/droids/helper/factories/GenericFactory.java (original) +++ labs/droids/trunk/src/core/java/org/apache/droids/helper/factories/GenericFactory.java Mon Jan 14 07:55:34 2008 @@ -2,7 +2,13 @@ import java.util.LinkedHashMap; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + public class GenericFactory { + + protected final Log log = LogFactory.getLog(this.getClass().getCanonicalName()); + private LinkedHashMap<String, Object> map; public LinkedHashMap<String, Object> getMap() { Added: labs/droids/trunk/src/core/java/org/apache/droids/helper/factories/HandlerFactory.java URL: 
http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/helper/factories/HandlerFactory.java?rev=611834&view=auto ============================================================================== --- labs/droids/trunk/src/core/java/org/apache/droids/helper/factories/HandlerFactory.java (added) +++ labs/droids/trunk/src/core/java/org/apache/droids/helper/factories/HandlerFactory.java Mon Jan 14 07:55:34 2008 @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.droids.helper.factories; + +import java.io.InputStream; +import java.net.URL; +import java.util.Iterator; + +import org.apache.droids.api.Handler; +import org.apache.droids.api.Parse; + +public class HandlerFactory extends GenericFactory { + + public boolean handle(InputStream stream, URL url, Parse parse) { + for (Iterator<String> iterator = getMap().keySet().iterator(); iterator + .hasNext();) { + if (stream == null) + return false; + String handlerName = iterator.next(); + Handler handler = (Handler) getMap().get(handlerName); + try { + handler.handle(stream, url, parse); + } catch (Exception e) { + log.fatal("Handler \""+handlerName + "\" has thrown an error.", e); + } + } + + return true; + } + +} Propchange: labs/droids/trunk/src/core/java/org/apache/droids/helper/factories/HandlerFactory.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: labs/droids/trunk/src/core/java/org/apache/droids/helper/factories/URLFiltersFactory.java URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/helper/factories/URLFiltersFactory.java?rev=611834&r1=611833&r2=611834&view=diff ============================================================================== --- labs/droids/trunk/src/core/java/org/apache/droids/helper/factories/URLFiltersFactory.java (original) +++ labs/droids/trunk/src/core/java/org/apache/droids/helper/factories/URLFiltersFactory.java Mon Jan 14 07:55:34 2008 @@ -17,23 +17,40 @@ package org.apache.droids.helper.factories; import java.util.Iterator; -import java.util.LinkedHashMap; import org.apache.droids.api.URLFilter; public class URLFiltersFactory extends GenericFactory { - private URLFilter[] filters; + /** Run all defined filters. Assume logical AND. + * @param urlString - url to test + * @return true if filter plugin accept the url, false if excluded. 
+ */ + public boolean accept(String urlString) { + for (Iterator<String> iterator = getMap().keySet().iterator(); iterator + .hasNext();) { + if (urlString == null) + return false; + URLFilter filter = (URLFilter) getMap().get(iterator.next()); + urlString = filter.filter(urlString); + if (urlString == null) + return false; + } + return true; + } - /** Run all defined filters. Assume logical AND. */ - public String filter(String urlString) { + /** Run a specific filter class. + * @param urlString - url to test + * @param filterName - name of the specific filter class. + * @return true if filter plugin accept the url, false if excluded. + */ + public boolean accept(String urlString, String filterName) { if (urlString == null) - return null; - for (Iterator iterator = getMap().entrySet().iterator(); iterator.hasNext();) { - urlString = ((URLFilter) iterator.next()).filter(urlString); - - } - - return urlString; + return false; + URLFilter filter = (URLFilter) getMap().get(filterName); + urlString = filter.filter(urlString); + if (urlString == null) + return false; + return true; } } Modified: labs/droids/trunk/src/core/java/org/apache/droids/parse/Outlink.java URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/parse/Outlink.java?rev=611834&r1=611833&r2=611834&view=diff ============================================================================== --- labs/droids/trunk/src/core/java/org/apache/droids/parse/Outlink.java (original) +++ labs/droids/trunk/src/core/java/org/apache/droids/parse/Outlink.java Mon Jan 14 07:55:34 2008 @@ -24,14 +24,17 @@ public class Outlink implements Task { private String toUrl; private String anchor; + private int depth; private String taskDate=new SimpleDateFormat("yyyyMMddHHmmss").format(new Date(System .currentTimeMillis())); - public Outlink(String toUrl, String anchor){ + public Outlink(String toUrl, String anchor, int depth2){ this.toUrl=toUrl; this.anchor = anchor; + this.depth= depth2; } - public 
Outlink(String toUrl){ - this.toUrl=toUrl; + public Outlink(String toUrl, int depth2) { + this.toUrl=toUrl; + this.depth= depth2; } public String getToUrl() { return toUrl; } public String getAnchor() { return anchor; } @@ -40,5 +43,11 @@ } public String getTaskDate() { return taskDate; + } + public int getDepth() { + return depth; + } + public void setDepth(int depth) { + this.depth = depth; } } Modified: labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java?rev=611834&r1=611833&r2=611834&view=diff ============================================================================== --- labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java (original) +++ labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java Mon Jan 14 07:55:34 2008 @@ -24,8 +24,11 @@ import javax.xml.stream.XMLInputFactory; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.droids.api.Parse; import org.apache.droids.api.Parser; +import org.apache.droids.api.Task; import org.apache.droids.parse.Outlink; import org.apache.droids.parse.ParseData; import org.apache.droids.parse.ParseImpl; @@ -41,104 +44,120 @@ import org.xml.sax.SAXNotSupportedException; public class HtmlParser implements Parser { + protected final Log log = LogFactory.getLog(this.getClass() + .getCanonicalName()); - private XMLInputFactory inputFactory = XMLInputFactory.newInstance(); - private URL base; + private XMLInputFactory inputFactory = XMLInputFactory.newInstance(); - public Parse getParse(InputStream stream, URL base) { - this.base=base; - ParseData parseData = null; - getRemover(); - // setup filter chain - XMLDocumentFilter[] filters = { getRemover()}; - // create HTML parser - DOMFragmentParser parser = getParser(filters); - DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); - 
// parse document - // XMLInputSource source = new XMLInputSource(null, uri, uri); - try { - parser.parse(base.toExternalForm(), node); - parseData=extract(node); - } catch (Exception e) { - // TODO Auto-generated catch block - e.printStackTrace(); - return new ParseImpl(stream.toString(), null); - } - return new ParseImpl(stream.toString(), parseData); + private URL base; + + private Task link; + + public Parse getParse(InputStream stream, Task link) { + this.link = link; + try { + this.base = new URL(link.getId()); + } catch (MalformedURLException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + } + ParseData parseData = null; + // setup filter chain + XMLDocumentFilter[] filters = { getRemover() }; + // create HTML parser + DOMFragmentParser parser = getParser(filters); + DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); + // parse document + // XMLInputSource source = new XMLInputSource(null, uri, uri); + try { + parser.parse(base.toExternalForm(), node); + parseData = extract(node); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + return new ParseImpl(stream.toString(), null); } + return new ParseImpl(stream.toString(), parseData); + } - private ParseData extract(DocumentFragment node) { - ArrayList<Outlink> links = new ArrayList<Outlink>(); - try { - extractLinks(node,links,new HashSet<String>()); - } catch (MalformedURLException e) { - e.printStackTrace(); - } - Outlink[] outlinks = new Outlink[0]; - outlinks = (Outlink[])links.toArray(new Outlink[links.size()]); - return new ParseData(outlinks); - } - - private DOMFragmentParser getParser(XMLDocumentFilter[] filters) { - DOMFragmentParser parser = new DOMFragmentParser(); - try { - parser.setProperty("http://cyberneko.org/html/properties/filters", filters); - parser - .setFeature( - "http://cyberneko.org/html/features/balance-tags/ignore-outside-content", - false); - parser.setFeature( - 
"http://cyberneko.org/html/features/balance-tags/document-fragment", - true); - parser.setFeature("http://cyberneko.org/html/features/report-errors", false); - } catch (SAXNotRecognizedException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (SAXNotSupportedException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - return parser; + private ParseData extract(DocumentFragment node) { + ArrayList<Outlink> links = new ArrayList<Outlink>(); + try { + extractLinks(node, links, new HashSet<String>()); + } catch (MalformedURLException e) { + e.printStackTrace(); + } + Outlink[] outlinks = new Outlink[0]; + outlinks = (Outlink[]) links.toArray(new Outlink[links.size()]); + return new ParseData(outlinks); + } + + private DOMFragmentParser getParser(XMLDocumentFilter[] filters) { + DOMFragmentParser parser = new DOMFragmentParser(); + try { + parser.setProperty("http://cyberneko.org/html/properties/filters", + filters); + parser + .setFeature( + "http://cyberneko.org/html/features/balance-tags/ignore-outside-content", + false); + parser.setFeature( + "http://cyberneko.org/html/features/balance-tags/document-fragment", + true); + parser.setFeature("http://cyberneko.org/html/features/report-errors", + false); + } catch (SAXNotRecognizedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (SAXNotSupportedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); } + return parser; + } - private ElementRemover getRemover() { - // create element remover filter - ElementRemover remover = new ElementRemover(); - // set which elements to accept - remover.acceptElement("a", new String[] { "href" }); - // completely remove some elements - remover.removeElement("script"); - remover.removeElement("head"); - return remover; - } - - private void extractLinks(Node node, ArrayList<Outlink> links, HashSet<String> set) throws MalformedURLException { - if (node.getNodeType() == Node.ELEMENT_NODE) { 
- if ("a".equalsIgnoreCase(node.getNodeName())) { - NamedNodeMap attrs = node.getAttributes(); - String target; - for (int i = 0; i < attrs.getLength(); i++) { - Node attr = attrs.item(i); - String attrName = attr.getNodeName(); - if (attrName.equalsIgnoreCase("href")) { - target = attr.getNodeValue(); - try { - final Outlink outlink = new Outlink(target.contains(":/")?target:new URL (base,target).toString()); - if (!set.contains(outlink.getToUrl())) { - set.add(outlink.getToUrl()); - links.add(outlink); - } - } catch (Exception e) { } + private ElementRemover getRemover() { + // create element remover filter + ElementRemover remover = new ElementRemover(); + // set which elements to accept + remover.acceptElement("a", new String[] { "href" }); + // completely remove some elements + remover.removeElement("script"); + remover.removeElement("head"); + return remover; + } + + private void extractLinks(Node node, ArrayList<Outlink> links, + HashSet<String> set) throws MalformedURLException { + if (node.getNodeType() == Node.ELEMENT_NODE) { + if ("a".equalsIgnoreCase(node.getNodeName())) { + NamedNodeMap attrs = node.getAttributes(); + String target; + for (int i = 0; i < attrs.getLength(); i++) { + Node attr = attrs.item(i); + String attrName = attr.getNodeName(); + if (attrName.equalsIgnoreCase("href")) { + target = attr.getNodeValue(); + try { + final Outlink outlink = new Outlink( + target.contains(":/") ? 
target : new URL(base, target) + .toString(), link.getDepth() + 1); + if (!set.contains(outlink.getToUrl())) { + set.add(outlink.getToUrl()); + links.add(outlink); + } + } catch (Exception e) { } } } } - NodeList children = node.getChildNodes(); - if (children != null) { - int len = children.getLength(); - for (int i = 0; i < len; i++) { - extractLinks(children.item(i), links, set); - } + } + NodeList children = node.getChildNodes(); + if (children != null) { + int len = children.getLength(); + for (int i = 0; i < len; i++) { + extractLinks(children.item(i), links, set); } } + } } Modified: labs/droids/trunk/src/core/java/org/apache/droids/queue/QueueBean.java URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/queue/QueueBean.java?rev=611834&r1=611833&r2=611834&view=diff ============================================================================== --- labs/droids/trunk/src/core/java/org/apache/droids/queue/QueueBean.java (original) +++ labs/droids/trunk/src/core/java/org/apache/droids/queue/QueueBean.java Mon Jan 14 07:55:34 2008 @@ -20,7 +20,6 @@ public class QueueBean { - protected int maxSize = 0; protected int maxDepth = 0; @@ -63,4 +62,10 @@ this.maxDepth = maxDepth; } + public boolean acceptSize(int i){ + return (maxSize==-1)?true:maxSize>=i; + } + public boolean acceptDepth(int i){ + return (maxDepth==-1)?true:maxDepth>=i; + } } Modified: labs/droids/trunk/src/core/java/org/apache/droids/queue/QueueLink.java URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/queue/QueueLink.java?rev=611834&r1=611833&r2=611834&view=diff ============================================================================== --- labs/droids/trunk/src/core/java/org/apache/droids/queue/QueueLink.java (original) +++ labs/droids/trunk/src/core/java/org/apache/droids/queue/QueueLink.java Mon Jan 14 07:55:34 2008 @@ -20,14 +20,17 @@ import org.apache.droids.api.Task; public class QueueLink implements Link,Task { - public 
QueueLink(String uri, String taskDate) { + public QueueLink(String uri, String taskDate, int i) { id = uri; this.taskDate=taskDate; + this.depth=i; } private String[] from, to; private String lastModifiedDate, taskDate, id; + + private int depth; public String[] getFrom() { return from; @@ -47,6 +50,14 @@ public String getTaskDate() { return taskDate; + } + + public int getDepth() { + return depth; + } + + public void setDepth(int depth) { + this.depth=depth; } } Modified: labs/droids/trunk/src/core/java/org/apache/droids/queue/Simple.java URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/queue/Simple.java?rev=611834&r1=611833&r2=611834&view=diff ============================================================================== --- labs/droids/trunk/src/core/java/org/apache/droids/queue/Simple.java (original) +++ labs/droids/trunk/src/core/java/org/apache/droids/queue/Simple.java Mon Jan 14 07:55:34 2008 @@ -43,7 +43,7 @@ LinkedList<Task> list = new LinkedList<Task>(); for (int i = 0; i < initialTask.length; i++) { Link task = (Link) initialTask[i]; - if (null != task) { + if (null != task & acceptSize(i)) { allTasks.put(task.getId(), task); list.add(task); } @@ -62,14 +62,14 @@ if (null != toDoLinks) { for (int i = 0; i < toDoLinks.length; i++) { Task task = toDoLinks[i]; - if (null != task) { + if (null != task & acceptSize(i+allTasks.size())&acceptDepth(task.getDepth())) { list.add(task); } } } for (int i = 0; i < filterLinks.length; i++) { Task task = filterLinks[i]; - if (null != task & !allTasks.containsKey(task.getId())) { + if (null != task & acceptSize(i+allTasks.size()) & !allTasks.containsKey(task.getId())&acceptDepth(task.getDepth())) { allTasks.put(task.getId(), task); list.add(task); } --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]