cmarschner 2002/06/01 11:55:16 Modified: contributions/webcrawler-LARM CHANGES.txt contributions/webcrawler-LARM/doc webcrawler_tech_overview.doc contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher Fetcher.java FetcherMain.java FetcherTask.java Filter.java MessageHandler.java URLVisitedFilter.java contributions/webcrawler-LARM/src/de/lanlab/larm/parser Tokenizer.java contributions/webcrawler-LARM/src/de/lanlab/larm/storage DocumentStorage.java LogStorage.java NullStorage.java SQLServerStorage.java contributions/webcrawler-LARM/src/de/lanlab/larm/util WebDocument.java Added: contributions/webcrawler-LARM TODO.txt build.xml contributions/webcrawler-LARM/libs placeholder contributions/webcrawler-LARM/src/de/lanlab/larm/parser EntityManager.java SimpleCharArrayWriter.java contributions/webcrawler-LARM/src/de/lanlab/larm/storage LinkLogStorage.java LinkStorage.java StoragePipeline.java Removed: contributions/webcrawler-LARM/src/hplb/misc ByteArray.java contributions/webcrawler-LARM/src/hplb/org/w3c/dom Attribute.java AttributeList.java Comment.java DOM.java Document.java DocumentContext.java DocumentFragment.java Element.java Makefile Node.java NodeIterator.java PI.java Text.java TreeIterator.java contributions/webcrawler-LARM/src/hplb/org/xml/sax AttributeMap.java DocumentHandler.java EntityHandler.java ErrorHandler.java HandlerBase.java Makefile Parser.java XmlException.java contributions/webcrawler-LARM/src/hplb/xml Atom.java AttrImpl.java AttrListImpl.java CharBuffer.java DOMImpl.java DocContextImpl.java DocumentImpl.java ElementImpl.java EntityManager.java HTML.java HtmlXmlParser.java NodeImpl.java NodeListImpl.java Parser.java SAXAttributeMap.java TextImpl.java Tokenizer.java Utils.java contributions/webcrawler-LARM/src/hplb/xml/util HtmlObserver.java HtmlScanner.java NormalizeHtml.java RmMarkup.java TokTest.java UrlScanner.java Log: added storage pipeline; some fixes on Tokenizer Revision Changes Path 1.2 +7 -0 
jakarta-lucene-sandbox/contributions/webcrawler-LARM/CHANGES.txt Index: CHANGES.txt =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/CHANGES.txt,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- CHANGES.txt 22 May 2002 23:09:16 -0000 1.1 +++ CHANGES.txt 1 Jun 2002 18:55:15 -0000 1.2 @@ -1,5 +1,12 @@ $id: $ +2002-06-01 (cmarschner) + * divided Storage into LinkStorage and DocumentStorage + * introduced StoragePipeline, made MessageHandler a LinkStorage. Fetcher now stores everything in storages + * removed a couple of unused classes + now everything's prepared for a LuceneStorage + * added build.xml by Mehran Mehr + 2002-05-23 (cmarschner) * removed 0x0d0d from the source files (Otis?) * included Apache License into all of the source files in de.lanlab.larm.* directories 1.1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/TODO.txt Index: TODO.txt =================================================================== Todos for 1.0 (not yet ordered in decreasing priority) $id: $ * Bugs - on very fast LAN connections (100MBit), sockets are not freed as fast as allocated - some relative URLs are not appended appropriately, leading to wrong and growing URLs * Build - added build.xml, but build.bat and build.sh are still working without ANT. Change that. 
* LuceneStorage - define a configurable interface that saves fetched pages into a Lucene index * Configuration - move all configuration stuff into a meaningful properties file * URLs: - include a URLNormalizer * lowercase host names * avoid ambiguities like '%20' / '+' * make sure http://host URLs end with "/" * avoid host name aliases - two host names / one IP address can point to the same web site: www.lmu.de / www.uni-muenchen.de - two host names / one IP address can point to different web sites (then other URLs / pages must differ) suche.lmu.de / interesse.lmu.de * cater for 301/302 result codes * Repository - optionally use a database as repository (caches, queues, logs) - if done so, use URL reordering to speed things up * Tests - Put all tests into a JUnit test suite * distribution - optionally send messages through a JMS topic. - create an executable that installs a source (like JMS, page files) and a storage pipeline - partition the URL space for distributed Fetchers * Speed - avoid synchronization delays by putting several URLMessages into one FetcherTask * Services - clean up ThreadMonitor - incorporate a CRON-like service that enables timed GC'ing, batched data transfer, and monitoring * Politeness - add the option to restrict the number of host accesses per hour/minute * Anchor text extraction * read until a meaningful end tag, not just the first encountered * remove entities * optionally remove Tags, leave ALT attribute * remove redundant spaces Nice-to-have: * Stop and Continue (probably with database repository) * "Hot Configure" from outside * Web Interface Next topic: * Incremental crawling 1.1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/build.xml Index: build.xml =================================================================== <?xml version="1.0"?> <project name="webcrawler-LARM" default="build" basedir="."> <property name="name" value="webcrawler_LARM"/> <property name="version" value="0.5"/> <property name="final.name" 
value="${name}-${version}"/> <property name="debug" value="on"/> <property name="src.dir" value="./src"/> <property name="lib.dir" value="./libs"/> <property name="logs.dir" value="./logs"/> <property name="cache.dir" value="./cachingqueue"/> <property name="build.dir" value="./build"/> <property name="src.httpclient" value="${lib.dir}/HTTPClient.zip"/> <property name="build.classes" value="${build.dir}/src"/> <property name="build.src" value="${build.dir}/src"/> <property name="build.encoding" value="ISO-8859-1"/> <property name="threads" value="15"/> <!-- Build classpath --> <path id="classpath"> <pathelement location="${build.classes}"/> <fileset dir="${lib.dir}"> <include name="*.jar" /> </fileset> </path> <path id="run.classpath"> <pathelement location="${build.dir}/${final.name}.jar"/> <fileset dir="${lib.dir}"> <include name="*.jar" /> </fileset> </path> <!-- ================================================================== --> <!-- B U I L D --> <!-- ================================================================== --> <target name="build" description="-> builds jar file"> <mkdir dir="${build.dir}"/> <mkdir dir="${build.classes}"/> <mkdir dir="${build.src}"/> <unzip src="${src.httpclient}" dest="${build.src}" overwrite="false"/> <javac encoding="${build.encoding}" srcdir="${src.dir}:${build.src}" excludes="**/CVS/*" destdir="${build.classes}" debug="${debug}"> <classpath refid="classpath"/> </javac> <jar jarfile="${build.dir}/${final.name}.jar" basedir="${build.classes}" /> </target> <!-- ================================================================== --> <!-- Check Syntax for Run Task --> <!-- ================================================================== --> <target name="checksyntax" unless="start"> <echo> use run with the following syntax ant run -Dstart=<URL> -Drestrictto=<Pattern> [-threads=<Thread Count>] default value for threads is 15 </echo> </target> <!-- ================================================================== --> <!-- R U N 
--> <!-- ================================================================== --> <target name="run" depends="build, checksyntax" if="start" description="-> runs command-line version of the crawler"> <delete dir="${logs.dir}"/> <mkdir dir="${logs.dir}"/> <java classname="de.lanlab.larm.fetcher.FetcherMain" fork="yes"> <jvmarg value="-server"/> <jvmarg value="-Xmx400mb"/> <arg value="-start"/> <arg value="${start}"/> <arg value="-restrictto"/> <arg value="${restrictto}"/> <arg value="-threads"/> <arg value="${threads}"/> <classpath refid="run.classpath"/> </java> </target> <!-- ================================================================== --> <!-- C L E A N L A S T R U N --> <!-- ================================================================== --> <target name="cleanlastrun" description="-> cleans files created by each run of the crawler"> <delete dir="${logs.dir}"/> <delete dir="${cache.dir}"/> </target> <!-- ================================================================== --> <!-- C L E A N A L L --> <!-- ================================================================== --> <target name="cleanall" depends="cleanlastrun" description="-> cleans all build and run files"> <delete dir="${build.dir}"/> </target> </project> 1.2 +243 -1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/doc/webcrawler_tech_overview.doc <<Binary file>> 1.1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/libs/placeholder Index: placeholder =================================================================== CVS happy 1.3 +4 -3 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java Index: Fetcher.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- Fetcher.java 22 May 2002 23:09:17 -0000 1.2 +++ Fetcher.java 1 Jun 2002 18:55:15 -0000 1.3 
@@ -72,7 +72,7 @@ * so that all filtering can be made beforehand. * * @author Clemens Marschner - * @version $Id: Fetcher.java,v 1.2 2002/05/22 23:09:17 cmarschner Exp $ + * @version $Id: Fetcher.java,v 1.3 2002/06/01 18:55:15 cmarschner Exp $ */ public class Fetcher implements MessageListener @@ -106,10 +106,11 @@ * @param storage the storage where all documents are stored * @param hostManager the host manager */ - public Fetcher(int maxThreads, DocumentStorage storage, HostManager hostManager) + public Fetcher(int maxThreads, DocumentStorage docStorage, LinkStorage linkStorage, HostManager hostManager) { this.storage = storage; - FetcherTask.setStorage(storage); + FetcherTask.setDocStorage(docStorage); + FetcherTask.setLinkStorage(linkStorage); fetcherPool = new ThreadPool(maxThreads, new FetcherThreadFactory(hostManager)); fetcherPool.setQueue(new FetcherTaskQueue()); docsRead = 0; 1.3 +13 -4 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java Index: FetcherMain.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- FetcherMain.java 22 May 2002 23:09:17 -0000 1.2 +++ FetcherMain.java 1 Jun 2002 18:55:15 -0000 1.3 @@ -73,7 +73,7 @@ * * @author Clemens Marschner * @created December 16, 2000 - * @version $Id: FetcherMain.java,v 1.2 2002/05/22 23:09:17 cmarschner Exp $ + * @version $Id: FetcherMain.java,v 1.3 2002/06/01 18:55:15 cmarschner Exp $ */ public class FetcherMain { @@ -179,7 +179,14 @@ // existing message pipeline SimpleLogger storeLog = new SimpleLogger("store", false); SimpleLogger linksLog = new SimpleLogger("links", false); - this.storage = new LogStorage(storeLog, true, "logs/pagefile"); + + + StoragePipeline storage = new StoragePipeline(); + storage.addDocStorage(new LogStorage(storeLog, /* save in 
page files? */ false, /* logfile prefix */ "logs/pagefile")); + storage.addLinkStorage(new LinkLogStorage(linksLog)); + storage.addLinkStorage(messageHandler); + //storage.addStorage(new LuceneStorage(...)); + //storage.addStorage(new JMSStorage(...)); // a third example would be the NullStorage, which converts the documents into // heat, which evaporates above the processor @@ -188,14 +195,14 @@ // create the filters and add them to the message queue urlScopeFilter = new URLScopeFilter(); - urlVisitedFilter = new URLVisitedFilter(100000, linksLog); + urlVisitedFilter = new URLVisitedFilter(100000); // dnsResolver = new DNSResolver(); hostManager = new HostManager(1000); reFilter = new RobotExclusionFilter(hostManager); - fetcher = new Fetcher(nrThreads, storage, hostManager); + fetcher = new Fetcher(nrThreads, storage, storage, hostManager); knownPathsFilter = new KnownPathsFilter(); @@ -206,6 +213,8 @@ // prevent GZipped files from being decoded HTTPConnection.removeDefaultModule(HTTPClient.ContentEncodingModule.class); + + // initialize the threads fetcher.init(); 1.3 +31 -11 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java Index: FetcherTask.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- FetcherTask.java 22 May 2002 23:09:17 -0000 1.2 +++ FetcherTask.java 1 Jun 2002 18:55:15 -0000 1.3 @@ -61,6 +61,8 @@ import de.lanlab.larm.util.WebDocument; import de.lanlab.larm.util.SimpleCharArrayReader; import de.lanlab.larm.storage.DocumentStorage; +import de.lanlab.larm.storage.LinkStorage; + import de.lanlab.larm.util.State; import de.lanlab.larm.util.SimpleLogger; import de.lanlab.larm.net.HttpTimeoutFactory; @@ -79,7 +81,7 @@ * be put into the message handler again. 
* * @author Clemens Marschner - * @version $Id: FetcherTask.java,v 1.2 2002/05/22 23:09:17 cmarschner Exp $ + * @version $Id: FetcherTask.java,v 1.3 2002/06/01 18:55:15 cmarschner Exp $ */ public class FetcherTask implements InterruptableTask, LinkHandler, Serializable @@ -122,9 +124,16 @@ private volatile long bytesRead = 0; /** - * the storage this task will put the document to + * the docStorage this task will put the document to + */ + private static volatile DocumentStorage docStorage; + + /** + * the docStorage this task will put the links to */ - private static volatile DocumentStorage storage; + private static volatile LinkStorage linkStorage; + + /** * task state IDs. comparisons will be done by their references, so always @@ -207,13 +216,23 @@ /** - * Sets the document storage + * Sets the document docStorage + * + * @param docStorage The new docStorage + */ + public static void setDocStorage(DocumentStorage docStorage) + { + FetcherTask.docStorage = docStorage; + } + + /** + * Sets the document linkStorage * - * @param storage The new storage + * @param linkStorage The new linkStorage */ - public static void setStorage(DocumentStorage storage) + public static void setLinkStorage(LinkStorage linkStorage) { - FetcherTask.storage = storage; + FetcherTask.linkStorage = linkStorage; } @@ -382,8 +401,9 @@ log.log("scanned"); } taskState.setState(FT_STORING, ipURL); - messageHandler.putMessages(foundUrls); - storage.store(new WebDocument(contextUrl, contentType, fullBuffer, statusCode, actURLMessage.getReferer(), contentLength, title)); + linkStorage.storeLinks(foundUrls); + //messageHandler.putMessages(foundUrls); + docStorage.store(new WebDocument(contextUrl, contentType, fullBuffer, statusCode, actURLMessage.getReferer(), contentLength, title)); log.log("stored"); } } @@ -519,8 +539,8 @@ /** - * this is called whenever a links was found in the current document, - * Don't create too many objects here, this will be called + * this is called whenever a link was 
found in the current document, + * Don't create too many objects here, as this will be called * millions of times * * @param link Description of the Parameter 1.3 +2 -2 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Filter.java Index: Filter.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Filter.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- Filter.java 22 May 2002 23:09:17 -0000 1.2 +++ Filter.java 1 Jun 2002 18:55:15 -0000 1.3 @@ -52,12 +52,12 @@ * <http://www.apache.org/>. */ - package de.lanlab.larm.fetcher; +package de.lanlab.larm.fetcher; /** * base class of all filter classes - * @version $Id: Filter.java,v 1.2 2002/05/22 23:09:17 cmarschner Exp $ + * @version $Id: Filter.java,v 1.3 2002/06/01 18:55:15 cmarschner Exp $ */ public abstract class Filter { 1.3 +21 -9 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/MessageHandler.java Index: MessageHandler.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/MessageHandler.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- MessageHandler.java 22 May 2002 23:09:17 -0000 1.2 +++ MessageHandler.java 1 Jun 2002 18:55:15 -0000 1.3 @@ -58,6 +58,7 @@ import de.lanlab.larm.util.SimpleObservable; import de.lanlab.larm.util.CachingQueue; import de.lanlab.larm.util.UnderflowException; +import de.lanlab.larm.storage.LinkStorage; /** * this is a message handler that runs in its own thread. 
@@ -69,9 +70,9 @@ * object, usually the one they got.<br> * The filters will run synchronously within the message handler thread<br> * This implements a chain of responsibility-style message handling - * @version $Id: MessageHandler.java,v 1.2 2002/05/22 23:09:17 cmarschner Exp $ + * @version $Id: MessageHandler.java,v 1.3 2002/06/01 18:55:15 cmarschner Exp $ */ -public class MessageHandler implements Runnable +public class MessageHandler implements Runnable, LinkStorage { /** @@ -118,7 +119,7 @@ /** * messageHandler-Thread erzeugen und starten */ - MessageHandler() + public MessageHandler() { t = new Thread(this,"MessageHandler Thread"); t.setPriority(5); // higher priority to prevent starving when a lot of fetcher threads are used @@ -175,7 +176,7 @@ /** - * einen Event in die Schlange schreiben + * insert one message into the queue */ public void putMessage(Message msg) { @@ -208,6 +209,13 @@ } } + public Collection storeLinks(Collection links) + { + putMessages(links); + return links; + } + + /** * the main messageHandler-Thread. */ @@ -251,22 +259,22 @@ messageQueueObservable.setChanged(); messageQueueObservable.notifyObservers(new Integer(-1)); // Message processed - // und verteilen. Die Listener erhalten die Message in ihrer - // Eintragungsreihenfolge und können die Message auch verändern + // now distribute them. 
The handlers get the messages in the order + // of insertion and have the right to change them Iterator i = listeners.iterator(); while(i.hasNext()) { - //System.out.println("Verteile..."); try { MessageListener listener = (MessageListener)i.next(); m = (Message)listener.handleRequest(m); if (m == null) { + // handler has consumed the message messageProcessorObservable.setChanged(); messageProcessorObservable.notifyObservers(listener); - break; // Handler hat die Message konsumiert + break; } } catch(ClassCastException e) @@ -285,7 +293,7 @@ messagesWaiting = false; // System.out.println("MessageHandler: messagesWaiting = true although nothing queued!"); // @FIXME: here is still a multi threading issue. I don't get it why this happens. - // does someone want to draw a petri net of this? + // does someone want to draw a petri net of this? ;-) } catch (Exception e) { @@ -299,5 +307,9 @@ public int getQueued() { return messageQueue.size(); + } + + public void openLinkStorage() + { } } 1.3 +4 -12 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLVisitedFilter.java Index: URLVisitedFilter.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLVisitedFilter.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- URLVisitedFilter.java 22 May 2002 23:09:17 -0000 1.2 +++ URLVisitedFilter.java 1 Jun 2002 18:55:15 -0000 1.3 @@ -67,9 +67,9 @@ * * @author Clemens Marschner * @created 3. 
Januar 2002 - * @version $Id: URLVisitedFilter.java,v 1.2 2002/05/22 23:09:17 cmarschner Exp $ + * @version $Id: URLVisitedFilter.java,v 1.3 2002/06/01 18:55:15 cmarschner Exp $ */ -class URLVisitedFilter extends Filter implements MessageListener +public class URLVisitedFilter extends Filter implements MessageListener { /** @@ -79,13 +79,10 @@ */ public void notifyAddedToMessageHandler(MessageHandler handler) { - this.messageHandler = handler; } - MessageHandler messageHandler; - - SimpleLogger log; + //SimpleLogger log; HashSet urlHash; @@ -98,10 +95,9 @@ * * @param initialHashCapacity Description of the Parameter */ - public URLVisitedFilter(int initialHashCapacity, SimpleLogger log) + public URLVisitedFilter(int initialHashCapacity) { urlHash = new HashSet(initialHashCapacity); - this.log = log; //urlVector = new Vector(initialHashCapacity); } @@ -132,10 +128,6 @@ { //System.out.println("URLVisitedFilter: " + urlString + " already present."); filtered++; - if(log != null) - { - log.log(urlMessage.getInfo()); - } return null; } else 1.3 +19 -29 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/parser/Tokenizer.java Index: Tokenizer.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/parser/Tokenizer.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- Tokenizer.java 22 May 2002 23:09:18 -0000 1.2 +++ Tokenizer.java 1 Jun 2002 18:55:15 -0000 1.3 @@ -54,14 +54,9 @@ package de.lanlab.larm.parser; -import hplb.org.xml.sax.*; -import hplb.xml.*; -import hplb.xml.util.*; - import java.util.Dictionary; import java.util.Hashtable; import java.io.*; -import hplb.misc.ByteArray; import java.net.URL; /** @@ -71,42 +66,37 @@ * some bugs. And it's FAST, about 10 x faster than the original HEX parser. * Being some sort of SAX parser it calls the callback functions of the LinkHandler * when links are found. 
- * @todo add handling of anchor texts + * Attention: This parser is not thread safe, as a lot of locks were removed * * @author Clemens Marschner - * $Id: Tokenizer.java,v 1.2 2002/05/22 23:09:18 cmarschner Exp $ + * $Id: Tokenizer.java,v 1.3 2002/06/01 18:55:15 cmarschner Exp $ */ -public class Tokenizer implements hplb.org.xml.sax.Parser +public class Tokenizer { /** * Sets the entityHandler attribute of the Tokenizer object * * @param e The new entityHandler value - */ - public void setEntityHandler(hplb.org.xml.sax.EntityHandler e) { } - + * + public void setEntityHandler(EntityHandler e) { } + */ /** * Sets the errorHandler attribute of the Tokenizer object * * @param e The new errorHandler value - */ + * public void setErrorHandler(hplb.org.xml.sax.ErrorHandler e) { } - + */ /** * Sets the documentHandler attribute of the Tokenizer object * * @param e The new documentHandler value - */ + * public void setDocumentHandler(hplb.org.xml.sax.DocumentHandler e) { } - - - /** - * The value of boolean attributes is this string. - */ - public final static String BOOLATTR = Atom.getAtom("BOOLATTR"); + */ // FSM states: final static int ST_START = 1; @@ -173,17 +163,17 @@ private boolean keepPCData; private boolean isInTitleTag; private boolean isInAnchorTag; - CharBuffer buf = new CharBuffer(); + SimpleCharArrayWriter buf = new SimpleCharArrayWriter(); boolean isStartTag = true; /** * Signals whether a non-empty element has any children. If not we must * generate an artificial empty-string child [characters(buf, 0, 0)]. 
*/ boolean noChildren; - CharBuffer tagname = new CharBuffer(); - CharBuffer attrName = new CharBuffer(); - CharBuffer attrValue = new CharBuffer(1000); - CharBuffer pcData = new CharBuffer(8000); + SimpleCharArrayWriter tagname = new SimpleCharArrayWriter(); + SimpleCharArrayWriter attrName = new SimpleCharArrayWriter(); + SimpleCharArrayWriter attrValue = new SimpleCharArrayWriter(1000); + SimpleCharArrayWriter pcData = new SimpleCharArrayWriter(8000); int pcDataLength; /** @@ -722,7 +712,7 @@ // the next end tag, at most 200 characters. // (end tags are often ommited, i.e. <a ...>text</td>) // regards other tags as text - // todo: read until next </a> or a couple other tags + // @todo: read until next </a> or a couple of other tags try { short count = 0; @@ -991,7 +981,7 @@ * Description of the Method * * @param attrs Description of the Parameter - */ + * public final void keysToLowerCase(SAXAttributeMap attrs) { for (int i = 0; i < attrs.n; i++) @@ -1003,7 +993,7 @@ } } } - + */ // toomuch true iff we read a '<' of the next token /** @@ -1036,7 +1026,7 @@ * if (toomuch) { * buf.setLength(buf.size() - 1); * } - * CharBuffer buf1 = rcgnzEntities ? entMngr.entityDecode(buf) : buf; + * SimpleCharArrayWriter buf1 = rcgnzEntities ? entMngr.entityDecode(buf) : buf; * docHandler.characters(buf1.getCharArray(), 0, buf1.size()); * /handler.gotText(getBuffer()); * toStart(); 1.1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/parser/EntityManager.java Index: EntityManager.java =================================================================== /* * ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2001 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Apache" and "Apache Software Foundation" and * "Apache Lucene" must not be used to endorse or promote products * derived from this software without prior written permission. For * written permission, please contact [EMAIL PROTECTED] * * 5. Products derived from this software may not be called "Apache", * "Apache Lucene", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. */ package de.lanlab.larm.parser; import java.util.Hashtable; import java.io.*; /** * A very simple entity manager. Based on HeX, the HTML enabled XML parser, by * Anders Kristensen, HP Labs Bristol * * @author Administrator * @created 1. Juni 2002 */ public class EntityManager { /** * Description of the Field */ protected Hashtable entities = new Hashtable(); /** * Description of the Field */ private Tokenizer tok; /** * Constructor for the EntityManager object * * @param tok Description of the Parameter */ public EntityManager(Tokenizer tok) { this.tok = tok; entities.put("amp", "&"); entities.put("lt", "<"); entities.put("gt", ">"); entities.put("apos", "'"); entities.put("quot", "\""); entities.put("auml", "ä"); entities.put("ouml", "ö"); entities.put("uuml", "ü"); entities.put("Auml", "Ä"); entities.put("Ouml", "Ö"); entities.put("Uuml", "Ü"); entities.put("szlig", "ß"); } /** * Finds entity and character references in the provided char array and * decodes them. The operation is destructive, i.e. the encoded string * replaces the original - this is straightforward since the new string can * only get shorter. 
* * @param buffer Description of the Parameter * @return Description of the Return Value * @exception Exception Description of the Exception */ public final SimpleCharArrayWriter entityDecode(SimpleCharArrayWriter buffer) throws Exception { char[] buf = buffer.getCharArray(); // avoids method calls int len = buffer.size(); // not fastest but certainly simplest: if (indexOf(buf, '&', 0, len) == -1) { return buffer; } SimpleCharArrayWriter newbuf = new SimpleCharArrayWriter(len); for (int start = 0; ; ) { int x = indexOf(buf, '&', start, len); if (x == -1) { newbuf.write(buf, start, len - start); return newbuf; } else { newbuf.write(buf, start, x - start); start = x + 1; x = indexOf(buf, ';', start, len); if (x == -1) { //tok.warning("Entity reference not semicolon terminated"); newbuf.write('&'); //break; //??????????? } else { try { writeEntityDef(buf, start, x - start, newbuf); start = x + 1; } catch (Exception ex) { //tok.warning("Bad entity reference"); } } } } } // character references are rare enough that we don't care about // creating a String object for them unnecessarily... 
/** * Description of the Method * * @param buf Description of the Parameter * @param off Description of the Parameter * @param len Description of the Parameter * @param out Description of the Parameter * @exception Exception Description of the Exception * @exception IOException Description of the Exception * @exception NumberFormatException Description of the Exception */ public void writeEntityDef(char[] buf, int off, int len, Writer out) throws Exception, IOException, NumberFormatException { Integer ch; //System.out.println("Entity: " + new String(buf, off, len) +" "+off+" "+len); if (buf[off] == '#') { // character reference off++; len--; if (buf[off] == 'x' || buf[off] == 'X') { ch = Integer.valueOf(new String(buf, off + 1, len - 1), 16); } else { ch = Integer.valueOf(new String(buf, off, len)); } out.write(ch.intValue()); } else { String ent = new String(buf, off, len); String val = (String) entities.get(ent); if (val != null) { out.write(val); } else { out.write("&" + ent + ";"); //tok.warning("unknown entity reference: " + ent); } } } /** * Description of the Method * * @param entity Description of the Parameter * @param value Description of the Parameter * @return Description of the Return Value */ public String defTextEntity(String entity, String value) { return (String) entities.put(entity, value); } /** * Returns the index within this String of the first occurrence of the * specified character, starting the search at fromIndex. This method * returns -1 if the character is not found. * * @param buf Description of the Parameter * @param ch Description of the Parameter * @param from Description of the Parameter * @param to Description of the Parameter * @return Description of the Return Value * @params buf the buffer to search * @params ch the character to search for * @params from the index to start the search * from * @params to the highest possible index returned * plus 1 * @throws IndexOutOfBoundsException if index out of bounds... 
*/ public final static int indexOf(char[] buf, int ch, int from, int to) { int i; for (i = from; i < to && buf[i] != ch; i++) { ; } // do nothing if (i < to) { return i; } else { return -1; } } } 1.1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/parser/SimpleCharArrayWriter.java Index: SimpleCharArrayWriter.java =================================================================== package de.lanlab.larm.parser; import java.io.CharArrayWriter; /** * <p>Title: </p> * <p>Description: </p> * <p>Copyright: Copyright (c) 2002</p> * <p>Company: </p> * @author unascribed * @version 1.0 */ public final class SimpleCharArrayWriter extends java.io.CharArrayWriter { public SimpleCharArrayWriter() { super(); } public SimpleCharArrayWriter(int size) { super(size); } // use only to *decrement* size public void setLength(int size) { // synchronized (lock) { if (size < count) count = size; // } } public char[] getCharArray() { // synchronized (lock) { return buf; // } } public int getLength() { return count; } } 1.3 +4 -1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/DocumentStorage.java Index: DocumentStorage.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/DocumentStorage.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- DocumentStorage.java 22 May 2002 23:09:18 -0000 1.2 +++ DocumentStorage.java 1 Jun 2002 18:55:16 -0000 1.3 @@ -72,6 +72,9 @@ * called to store a web document * * @param doc the document + * @return the document itself or a changed version. Only makes sense if + * storage pipeline is used; usually the storage would return the document + * as is. 
*/ - public void store(WebDocument doc); + public WebDocument store(WebDocument doc); } 1.3 +6 -2 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/LogStorage.java Index: LogStorage.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/LogStorage.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- LogStorage.java 22 May 2002 23:09:18 -0000 1.2 +++ LogStorage.java 1 Jun 2002 18:55:16 -0000 1.3 @@ -191,11 +191,14 @@ /** - * stores the document if storing is enabled + * writes file info to log file; + * stores the document if storing is enabled. in that case the log line contains + * the page file number and the index within that file * * @param doc Description of the Parameter + * @return the unchanged document */ - public void store(WebDocument doc) + public WebDocument store(WebDocument doc) { String docInfo = doc.getInfo(); if (logContents && isValid && doc.getDocumentBytes() != null) @@ -204,5 +207,6 @@ docInfo = docInfo + "\t" + pageFileCount + "\t" + offset; } log.logThreadSafe(docInfo); + return doc; } } 1.3 +8 -2 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/NullStorage.java Index: NullStorage.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/NullStorage.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- NullStorage.java 22 May 2002 23:09:18 -0000 1.2 +++ NullStorage.java 1 Jun 2002 18:55:16 -0000 1.3 @@ -65,7 +65,13 @@ { } - public void open() {} - public void store(WebDocument doc) {} + public void open() + { + } + + public WebDocument store(WebDocument doc) + { + return doc; + } } 1.3 +7 -1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/SQLServerStorage.java Index: 
SQLServerStorage.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/SQLServerStorage.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- SQLServerStorage.java 22 May 2002 23:09:18 -0000 1.2 +++ SQLServerStorage.java 1 Jun 2002 18:55:16 -0000 1.3 @@ -192,7 +192,12 @@ } } - public void store(WebDocument document) + /** + * + * @param document + * @return the unchanged document + */ + public WebDocument store(WebDocument document) { PreparedStatement addDoc = null; @@ -217,5 +222,6 @@ releaseStatement(addDoc); } } + return document; } } 1.1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/LinkLogStorage.java Index: LinkLogStorage.java =================================================================== package de.lanlab.larm.storage; import de.lanlab.larm.storage.LinkStorage; import de.lanlab.larm.util.SimpleLogger; import de.lanlab.larm.fetcher.URLMessage; import java.util.Collection; import java.util.Iterator; /** * Description of the Class * * @author Administrator * @created 1. 
Juni 2002
 */
public class LinkLogStorage implements LinkStorage {

    /** logger that receives one entry per stored link */
    SimpleLogger log;

    /**
     * Creates a link storage that writes to the given logger.
     *
     * @param logFile logger to which the info string of every link is written
     */
    public LinkLogStorage(SimpleLogger logFile) {
        log = logFile;
    }

    /**
     * Does nothing; this storage has nothing to open.
     */
    public void openLinkStorage() {
    }

    /**
     * Writes the info string of every link in the collection to the logger.
     * The logger's monitor is held for the whole call, so the entries of one
     * collection stay together as long as other writers lock the same logger.
     *
     * @param c links to log; every element is cast to URLMessage
     * @return the same collection, unchanged
     */
    public Collection storeLinks(Collection c) {
        synchronized (log) {
            Iterator links = c.iterator();
            while (links.hasNext()) {
                URLMessage link = (URLMessage) links.next();
                log.log(link.getInfo());
            }
        }
        return c;
    }
}

1.1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/LinkStorage.java
Index: LinkStorage.java
===================================================================
/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2001 The Apache Software Foundation. All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 * if any, must include the following acknowledgment:
 * "This product includes software developed by the
 * Apache Software Foundation (http://www.apache.org/)."
 * Alternately, this acknowledgment may appear in the software itself,
 * if and wherever such third-party acknowledgments normally appear.
 *
 * 4.
The names "Apache" and "Apache Software Foundation" and * "Apache Lucene" must not be used to endorse or promote products * derived from this software without prior written permission. For * written permission, please contact [EMAIL PROTECTED] * * 5. Products derived from this software may not be called "Apache", * "Apache Lucene", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. 
*/
package de.lanlab.larm.storage;

import java.util.Collection;

/**
 * A storage stage for links extracted from fetched documents.
 */
public interface LinkStorage {

    /**
     * Opens the link storage. Implementations that have nothing to set up
     * may leave this empty.
     */
    void openLinkStorage();

    /**
     * Stores the extracted links. The collection may contain links of more
     * than one document.
     *
     * @param c the links to store
     * @return the collection, may have been changed or set to null
     */
    Collection storeLinks(Collection c);
}

1.1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/StoragePipeline.java
Index: StoragePipeline.java
===================================================================
/*
 * ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2001 The Apache Software Foundation. All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 * if any, must include the following acknowledgment:
 * "This product includes software developed by the
 * Apache Software Foundation (http://www.apache.org/)."
 * Alternately, this acknowledgment may appear in the software itself,
 * if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
 * "Apache Lucene" must not be used to endorse or promote products
 * derived from this software without prior written permission. For
 * written permission, please contact [EMAIL PROTECTED]
 *
 * 5.
Products derived from this software may not be called "Apache", * "Apache Lucene", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. */ package de.lanlab.larm.storage; import de.lanlab.larm.util.WebDocument; import de.lanlab.larm.fetcher.URLMessage; import java.util.ArrayList; import java.util.Iterator; import java.util.Collection; /** * @author Clemens Marschner * @created 1. 
Juni 2002
 * @version $ver: $
 */
public class StoragePipeline implements DocumentStorage, LinkStorage {

    /** true once open() has run; document storages can no longer be added */
    boolean isOpen;

    /** true once openLinkStorage() has run; link storages can no longer be added */
    boolean isLinkStorageOpen;

    /** the DocumentStorage stages, in the order they were added */
    ArrayList docStorages;

    /** the LinkStorage stages, in the order they were added */
    ArrayList linkStorages;

    /**
     * Creates an empty pipeline with neither side opened.
     */
    public StoragePipeline() {
        isOpen = false;
        isLinkStorageOpen = false;
        docStorages = new ArrayList();
        linkStorages = new ArrayList();
    }

    /**
     * Opens all document storages in the order they were added and marks the
     * document side of the pipeline as open.
     */
    public void open() {
        for (Iterator it = docStorages.iterator(); it.hasNext(); ) {
            ((DocumentStorage) it.next()).open();
        }
        isOpen = true;
    }

    /**
     * Passes the document through all document storages in order. Each
     * storage receives whatever the previous one returned. The document is
     * discarded if a storage returns null: the remaining storages are
     * skipped and null is returned.
     *
     * @see DocumentStorage#store(WebDocument)
     * @param doc the document to store
     * @return the document as returned by the last storage that ran, or null
     *         if a storage discarded it
     */
    public WebDocument store(WebDocument doc) {
        for (Iterator it = docStorages.iterator(); it.hasNext(); ) {
            doc = ((DocumentStorage) it.next()).store(doc);
            if (doc == null) {
                break;
            }
        }
        return doc;
    }

    /**
     * Appends a document storage to the end of the pipeline.
     *
     * @param storage the storage to add
     * @throws IllegalStateException if open() has already been called
     */
    public void addDocStorage(DocumentStorage storage) {
        if (isOpen) {
            throw new IllegalStateException("storage can't be added if pipeline is already open");
        }
        docStorages.add(storage);
    }

    /**
     * Appends a link storage to the end of the pipeline.
     *
     * @param storage the storage to add
     * @throws IllegalStateException if openLinkStorage() has already been
     *         called
     */
    public void addLinkStorage(LinkStorage storage) {
        // Guard on the link-side flag: the link storages are opened by
        // openLinkStorage(), independently of open(). A storage added after
        // that call would never be opened.
        if (isLinkStorageOpen) {
            throw new IllegalStateException("storage can't be added if pipeline is already open");
        }
        linkStorages.add(storage);
    }

    /**
     * Opens all link storages in the order they were added and marks the
     * link side of the pipeline as open.
     */
    public void openLinkStorage() {
        for (Iterator it = linkStorages.iterator(); it.hasNext(); ) {
            ((LinkStorage) it.next()).openLinkStorage();
        }
        isLinkStorageOpen = true;
    }

    /**
     * Passes the link collection through all link storages in order. Each
     * storage receives whatever the previous one returned. If a storage
     * returns null, the remaining storages are skipped and null is returned.
     *
     * @param c the links to store
     * @return the collection as returned by the last storage that ran, or
     *         null if a storage discarded it
     */
    public Collection storeLinks(Collection c) {
        for (Iterator it = linkStorages.iterator(); it.hasNext(); ) {
            c = ((LinkStorage) it.next()).storeLinks(c);
            if (c == null) {
                break;
            }
        }
        return c;
    }
}
1.3 +1 -1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/WebDocument.java Index: WebDocument.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/WebDocument.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- WebDocument.java 22 May 2002 23:09:19 -0000 1.2 +++ WebDocument.java 1 Jun 2002 18:55:16 -0000 1.3 @@ -141,7 +141,7 @@ this.resultCode + "\t" + this.mimeType + "\t" + this.size + "\t" + - "\"" + this.title.replace('\"', (char)0xff ).replace('\n',' ').replace('\r',' ') + "\""; + "\"" + this.title.replace('\t',' ').replace('\"', (char)0xff ).replace('\n',' ').replace('\r',' ') + "\""; }
-- To unsubscribe, e-mail: <mailto:[EMAIL PROTECTED]> For additional commands, e-mail: <mailto:[EMAIL PROTECTED]>