Modified: incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/java/org/apache/droids/dynamic/TestSimpleDroid.java URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/java/org/apache/droids/dynamic/TestSimpleDroid.java?rev=1423339&r1=1423338&r2=1423339&view=diff ============================================================================== --- incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/java/org/apache/droids/dynamic/TestSimpleDroid.java (original) +++ incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/java/org/apache/droids/dynamic/TestSimpleDroid.java Tue Dec 18 08:47:39 2012 @@ -36,63 +36,63 @@ import org.springframework.context.suppo public class TestSimpleDroid { - protected LocalHttpServer testserver; + protected LocalHttpServer testserver; - private final static ApplicationContext context = new ClassPathXmlApplicationContext( - "classpath:/droids-core-test-context.xml"); + private final static ApplicationContext context = new ClassPathXmlApplicationContext( + "classpath:/droids-core-test-context.xml"); - private DroidsConfig droidsConfig = null; + private DroidsConfig droidsConfig = null; - @Before - public void setUp() throws Exception { - this.droidsConfig = (DroidsConfig) TestSimpleDroid.context - .getBean("org.apache.droids.dynamic.DroidsConfig"); - this.testserver = new LocalHttpServer(); - } - - @Test - public void testReportCrawlingDroid() throws Exception { - this.testserver.register("*", new ResourceHandler()); - this.testserver.start(); - - String baseURI = "http:/" + this.testserver.getServiceAddress(); - String targetURI = baseURI + "/start_html"; - - Droid<Link> droid = createSimpleReportCrawlingDroid(targetURI); - - droid.init(); - droid.start(); - droid.getTaskMaster().awaitTermination(30, TimeUnit.SECONDS); - - Assert.assertFalse(ReportHandler.getReport().isEmpty()); - Assert.assertEquals(5, ReportHandler.getReport().size()); - Assert.assertTrue(ReportHandler.getReport().contains( - baseURI + "/start_html")); - Assert.assertTrue(ReportHandler.getReport().contains( - baseURI + "/page1_html")); - Assert.assertTrue(ReportHandler.getReport().contains( - baseURI + "/page2_html")); - Assert.assertTrue(ReportHandler.getReport().contains( - baseURI + "/page3_html")); - Assert.assertTrue(ReportHandler.getReport().contains( - baseURI + "/page4_html")); - - ReportHandler.recycle(); - } - - private Droid<Link> createSimpleReportCrawlingDroid(final String targetURI) { - Droid<Link> droid = this.droidsConfig.getDroid("report"); - - Assert.assertFalse("Droid is null.", droid == null); - Assert.assertTrue( - "The test droid must be an instance of ReportCrawlingDroid", - droid instanceof ReportCrawlingDroid); - - final List<String> locations = new ArrayList<String>(); - locations.add(targetURI); - ((CrawlingDroid) droid).setInitialLocations(locations); + @Before + public void setUp() throws Exception { + this.droidsConfig = (DroidsConfig) TestSimpleDroid.context + .getBean("org.apache.droids.dynamic.DroidsConfig"); + this.testserver = new LocalHttpServer(); + } + + @Test + public void testReportCrawlingDroid() throws Exception { + this.testserver.register("*", new ResourceHandler()); + this.testserver.start(); + + String baseURI = "http:/" + this.testserver.getServiceAddress(); + String targetURI = baseURI + "/start_html"; + + Droid<Link> droid = createSimpleReportCrawlingDroid(targetURI); + + droid.init(); + droid.start(); + droid.getTaskMaster().awaitTermination(30, TimeUnit.SECONDS); + + Assert.assertFalse(ReportHandler.getReport().isEmpty()); + Assert.assertEquals(5, ReportHandler.getReport().size()); + Assert.assertTrue(ReportHandler.getReport().contains( + baseURI + "/start_html")); + Assert.assertTrue(ReportHandler.getReport().contains( + baseURI + "/page1_html")); + Assert.assertTrue(ReportHandler.getReport().contains( + baseURI + "/page2_html")); + Assert.assertTrue(ReportHandler.getReport().contains( + baseURI + "/page3_html")); + Assert.assertTrue(ReportHandler.getReport().contains( + baseURI + "/page4_html")); + + ReportHandler.recycle(); + } + + private Droid<Link> createSimpleReportCrawlingDroid(final String targetURI) { + Droid<Link> droid = this.droidsConfig.getDroid("report"); + + Assert.assertFalse("Droid is null.", droid == null); + Assert.assertTrue( + "The test droid must be an instance of ReportCrawlingDroid", + droid instanceof ReportCrawlingDroid); + + final List<String> locations = new ArrayList<String>(); + locations.add(targetURI); + ((CrawlingDroid) droid).setInitialLocations(locations); - return droid; - } + return droid; + } }
Modified: incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/resources/droids-core-test-context.xml URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/resources/droids-core-test-context.xml?rev=1423339&r1=1423338&r2=1423339&view=diff ============================================================================== --- incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/resources/droids-core-test-context.xml (original) +++ incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/resources/droids-core-test-context.xml Tue Dec 18 08:47:39 2012 @@ -15,105 +15,105 @@ See the License for the specific language governing permissions and limitations under the License. --> - <!-- - Using your own context - +++++++++++++++++++++++++ - The easiest way is to - a) create a droids-your-context.xml - b) add: - <import resource="classpath:/org/apache/droids/dynamic/droids-core-context.xml"/> - c) implement your own beans which will override the imported ones - d) Call the ant target like: - ant droids.crawl default -Ddroids.spring.context=PATH/droids-your-context.xml - --> +<!-- + Using your own context + +++++++++++++++++++++++++ + The easiest way is to + a) create a droids-your-context.xml + b) add: + <import resource="classpath:/org/apache/droids/dynamic/droids-core-context.xml"/> + c) implement your own beans which will override the imported ones + d) Call the ant target like: + ant droids.crawl default -Ddroids.spring.context=PATH/droids-your-context.xml +--> <beans xmlns="http://www.springframework.org/schema/beans" - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xmlns:configurator="http://cocoon.apache.org/schema/configurator" - xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-2.5.xsd + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xmlns:configurator="http://cocoon.apache.org/schema/configurator" + xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-2.5.xsd http://cocoon.apache.org/schema/configurator http://cocoon.apache.org/schema/configurator/cocoon-configurator-1.1.0.xsd"> - - <import resource="classpath:/org/apache/droids/dynamic/droids-core-context.xml"/> - - <!-- configuration properties file --> - <bean class="org.springframework.beans.factory.config.PropertyPlaceholderConfigurer"> - <property name="locations" value="classpath:/droids-core.properties"/> - </bean> - - <bean name="taskExceptionHandler" - class="org.apache.droids.impl.DefaultTaskExceptionHandler"> - </bean> - - <bean name="taskMaster" - class="org.apache.droids.impl.MultiThreadedTaskMaster"> - <property name="exceptionHandler" ref="taskExceptionHandler" /> - <property name="delayTimer" ref="org.apache.droids.delay.SimpleDelayTimer"/> - <!--<property name="maxThreads" value="${droids.maxThreads}"/>--> - </bean> - - <!-- Droids --> - <bean name="org.apache.droids.api.Droid/report" - class="org.apache.droids.robot.crawler.ReportCrawlingDroid"> - <constructor-arg ref="java.util.LinkedList" /> - <constructor-arg ref="taskMaster" /> - - - <property name="protocolFactory" ref="org.apache.droids.helper.factories.ProtocolFactory"/> - <property name="parserFactory" ref="org.apache.droids.helper.factories.ParserFactory"/> - <property name="filtersFactory" ref="org.apache.droids.helper.factories.FilterFactory"/> - </bean> - <!-- Queue --> - <bean id="java.util.LinkedList" - class="java.util.LinkedList"> - </bean> - <!-- Protocol --> - <bean - name="org.apache.droids.api.Protocol/http" - class="org.apache.droids.protocol.http.HttpProtocol" scope="singleton"> - <property name="userAgent" value="DROIDS-crawler-x-m01y08"/> - <property name="forceAllow" value="${droids.protocol.http.force}"/> - </bean> - <bean name="org.apache.droids.api.Protocol/file" - class="org.apache.droids.protocol.file.FileProtocol" scope="singleton"/> - <!-- Parser --> - <bean - name="text/html" - class="org.apache.droids.parse.html.HtmlParser"> - <property name="elements"> - <map> - <entry key="a" value="href"/> - <entry key="link" value="href"/> - <entry key="img" value="src"/> - <entry key="script" value="src"/> - </map> - </property> - </bean> - <!-- Filter --> - <bean - name="org.apache.droids.api.URLFilter/org.apache.droids.net.RegexURLFilter" - class="org.apache.droids.net.RegexURLFilter"> - <property name="file" value="${droids.filter.regex}"/> - </bean> - <!-- Handler --> - <bean - name="org.apache.droids.api.Handler/org.apache.droids.handle.SysoutHandler" - class="org.apache.droids.handle.SysoutHandler"/> - <bean - name="org.apache.droids.api.Handler/org.apache.droids.handle.SaveHandler" - class="org.apache.droids.handle.SaveHandler"> - <property name="saveContentHandlerStrategy" - ref="org.apache.droids.api.Handler/org.apache.droids.handle.DefaultSaveContentHandlerStrategy" /> - </bean> - <bean - name="org.apache.droids.api.Handler/org.apache.droids.handle.DefaultSaveContentHandlerStrategy" - class="org.apache.droids.handle.DefaultSaveContentHandlerStrategy"> - <property name="includeHost" value="true" /> - <property name="outputDir" value="tmp/" /> - </bean> - - - <bean - name="org.apache.droids.delay.SimpleDelayTimer" - class="org.apache.droids.delay.SimpleDelayTimer"> - <property name="delayMillis" value="${droids.delay.request}"/> - </bean> + + <import resource="classpath:/org/apache/droids/dynamic/droids-core-context.xml"/> + + <!-- configuration properties file --> + <bean class="org.springframework.beans.factory.config.PropertyPlaceholderConfigurer"> + <property name="locations" value="classpath:/droids-core.properties"/> + </bean> + + <bean name="taskExceptionHandler" + class="org.apache.droids.impl.DefaultTaskExceptionHandler"> + </bean> + + <bean name="taskMaster" + class="org.apache.droids.impl.MultiThreadedTaskMaster"> + <property name="exceptionHandler" ref="taskExceptionHandler"/> + <property name="delayTimer" ref="org.apache.droids.delay.SimpleDelayTimer"/> + <!--<property name="maxThreads" value="${droids.maxThreads}"/>--> + </bean> + + <!-- Droids --> + <bean name="org.apache.droids.api.Droid/report" + class="org.apache.droids.robot.crawler.ReportCrawlingDroid"> + <constructor-arg ref="java.util.LinkedList"/> + <constructor-arg ref="taskMaster"/> + + + <property name="protocolFactory" ref="org.apache.droids.helper.factories.ProtocolFactory"/> + <property name="parserFactory" ref="org.apache.droids.helper.factories.ParserFactory"/> + <property name="filtersFactory" ref="org.apache.droids.helper.factories.FilterFactory"/> + </bean> + <!-- Queue --> + <bean id="java.util.LinkedList" + class="java.util.LinkedList"> + </bean> + <!-- Protocol --> + <bean + name="org.apache.droids.api.Protocol/http" + class="org.apache.droids.protocol.http.HttpProtocol" scope="singleton"> + <property name="userAgent" value="DROIDS-crawler-x-m01y08"/> + <property name="forceAllow" value="${droids.protocol.http.force}"/> + </bean> + <bean name="org.apache.droids.api.Protocol/file" + class="org.apache.droids.protocol.file.FileProtocol" scope="singleton"/> + <!-- Parser --> + <bean + name="text/html" + class="org.apache.droids.parse.html.HtmlParser"> + <property name="elements"> + <map> + <entry key="a" value="href"/> + <entry key="link" value="href"/> + <entry key="img" value="src"/> + <entry key="script" value="src"/> + </map> + </property> + </bean> + <!-- Filter --> + <bean + name="org.apache.droids.api.URLFilter/org.apache.droids.net.RegexURLFilter" + class="org.apache.droids.net.RegexURLFilter"> + <property name="file" value="${droids.filter.regex}"/> + </bean> + <!-- Handler --> + <bean + name="org.apache.droids.api.Handler/org.apache.droids.handle.SysoutHandler" + class="org.apache.droids.handle.SysoutHandler"/> + <bean + name="org.apache.droids.api.Handler/org.apache.droids.handle.SaveHandler" + class="org.apache.droids.handle.SaveHandler"> + <property name="saveContentHandlerStrategy" + ref="org.apache.droids.api.Handler/org.apache.droids.handle.DefaultSaveContentHandlerStrategy"/> + </bean> + <bean + name="org.apache.droids.api.Handler/org.apache.droids.handle.DefaultSaveContentHandlerStrategy" + class="org.apache.droids.handle.DefaultSaveContentHandlerStrategy"> + <property name="includeHost" value="true"/> + <property name="outputDir" value="tmp/"/> + </bean> + + + <bean + name="org.apache.droids.delay.SimpleDelayTimer" + class="org.apache.droids.delay.SimpleDelayTimer"> + <property name="delayMillis" value="${droids.delay.request}"/> + </bean> </beans> Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml?rev=1423339&r1=1423338&r2=1423339&view=diff ============================================================================== --- incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml (original) +++ incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml Tue Dec 18 08:47:39 2012 @@ -15,53 +15,54 @@ See the License for the specific language governing permissions and limitations under the License. --> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> - <modelVersion>4.0.0</modelVersion> - <parent> - <artifactId>droids</artifactId> - <groupId>org.apache.droids</groupId> - <version>0.3.0-incubating-SNAPSHOT</version> - <relativePath>../pom.xml</relativePath> - </parent> - <artifactId>droids-tika</artifactId> - <name>Apache Droids Tika</name> - <inceptionYear>2007</inceptionYear> - <description>Apache Droids Tika Parser</description> - <packaging>jar</packaging> + <modelVersion>4.0.0</modelVersion> + <parent> + <artifactId>droids</artifactId> + <groupId>org.apache.droids</groupId> + <version>0.3.0-incubating-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>droids-tika</artifactId> + <name>Apache Droids Tika</name> + <inceptionYear>2007</inceptionYear> + <description>Apache Droids Tika Parser</description> + <packaging>jar</packaging> - <properties> - <tika-release-version>1.1</tika-release-version> - </properties> + <properties> + <tika-release-version>1.1</tika-release-version> + </properties> - <dependencies> - <dependency> - <groupId>org.apache.droids</groupId> - <artifactId>droids-core</artifactId> - <version>${project.version}</version> - </dependency> - <dependency> - <groupId>org.apache.tika</groupId> - <artifactId>tika-core</artifactId> - <version>${tika-release-version}</version> - </dependency> - <dependency> - <groupId>org.apache.tika</groupId> - <artifactId>tika-parsers</artifactId> - <version>${tika-release-version}</version> - <exclusions> - <exclusion> - <artifactId>commons-logging</artifactId> - <groupId>commons-logging</groupId> - </exclusion> - </exclusions> - </dependency> - <!-- test dependencies --> - <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - <version>${junit.version}</version> - <scope>test</scope> - </dependency> - </dependencies> + <dependencies> + <dependency> + <groupId>org.apache.droids</groupId> + <artifactId>droids-core</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-core</artifactId> + <version>${tika-release-version}</version> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parsers</artifactId> + <version>${tika-release-version}</version> + <exclusions> + <exclusion> + <artifactId>commons-logging</artifactId> + <groupId>commons-logging</groupId> + </exclusion> + </exclusions> + </dependency> + <!-- test dependencies --> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>${junit.version}</version> + <scope>test</scope> + </dependency> + </dependencies> </project> Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java?rev=1423339&r1=1423338&r2=1423339&view=diff ============================================================================== --- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java (original) +++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java Tue Dec 18 08:47:39 2012 @@ -54,76 +54,75 @@ import org.xml.sax.SAXException; /** * Parses documents using Tika. * Any document type that Tika can handle, can be handled by this class, - * including HTML. - * + * including HTML. */ public class TikaDocumentParser implements TikaParser { - protected static final Logger LOG = LoggerFactory.getLogger(TikaDocumentParser.class); - - @Override - public TikaParse parse(ContentEntity entity, Task task) throws DroidsException, - IOException { - // Init Tika objects - org.apache.tika.parser.Parser parser = new AutoDetectParser(); - Metadata metadata = new Metadata(); - - String charset = entity.getCharset(); - if (charset == null) { - charset = "UTF-8"; - } - - StringWriter dataBuffer = new StringWriter(); - StringWriter bodyBuffer = new StringWriter(); - StringWriter mainContentBuffer = new StringWriter(); - - SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); - TransformerHandler xmlHandler; - try { - xmlHandler = factory.newTransformerHandler(); - } catch (TransformerConfigurationException e) { - throw new DroidsException(e); - } - xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); - xmlHandler.setResult(new StreamResult(dataBuffer)); - - BoilerpipeContentHandler mainContentHandler = new BoilerpipeContentHandler(mainContentBuffer); - BodyContentHandler bodyHandler = new BodyContentHandler(bodyBuffer); - LinkContentHandler linkHandler = new LinkContentHandler(); - - TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, mainContentHandler, bodyHandler, linkHandler ); - - InputStream instream = entity.obtainContent(); - try { - parser.parse(instream, parallelHandler, metadata, new ParseContext()); - - ArrayList<Link> extractedTasks = new ArrayList<Link>(); - int depth = task.getDepth() + 1; - if (task instanceof LinkTask) { - for(org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) { - try { - URI uri = new URI(tikaLink.getUri()); - // Test to see if the scheme is empty - // This would indicate a relative URL, so resolve it against the task URI - if(uri.getScheme() == null) { - uri = ((Link) task).getURI().resolve(uri); + protected static final Logger LOG = LoggerFactory.getLogger(TikaDocumentParser.class); + + @Override + public TikaParse parse(ContentEntity entity, Task task) throws DroidsException, + IOException { + // Init Tika objects + org.apache.tika.parser.Parser parser = new AutoDetectParser(); + Metadata metadata = new Metadata(); + + String charset = entity.getCharset(); + if (charset == null) { + charset = "UTF-8"; + } + + StringWriter dataBuffer = new StringWriter(); + StringWriter bodyBuffer = new StringWriter(); + StringWriter mainContentBuffer = new StringWriter(); + + SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); + TransformerHandler xmlHandler; + try { + xmlHandler = factory.newTransformerHandler(); + } catch (TransformerConfigurationException e) { + throw new DroidsException(e); + } + xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); + xmlHandler.setResult(new StreamResult(dataBuffer)); + + BoilerpipeContentHandler mainContentHandler = new BoilerpipeContentHandler(mainContentBuffer); + BodyContentHandler bodyHandler = new BodyContentHandler(bodyBuffer); + LinkContentHandler linkHandler = new LinkContentHandler(); + + TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, mainContentHandler, bodyHandler, linkHandler); + + InputStream instream = entity.obtainContent(); + try { + parser.parse(instream, parallelHandler, metadata, new ParseContext()); + + ArrayList<Link> extractedTasks = new ArrayList<Link>(); + int depth = task.getDepth() + 1; + if (task instanceof LinkTask) { + for (org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) { + try { + URI uri = new URI(tikaLink.getUri()); + // Test to see if the scheme is empty + // This would indicate a relative URL, so resolve it against the task URI + if (uri.getScheme() == null) { + uri = ((Link) task).getURI().resolve(uri); + } + extractedTasks.add(new LinkTask((Link) task, uri, depth, tikaLink.getText())); + } catch (URISyntaxException e) { + if (LOG.isWarnEnabled()) { + LOG.warn("URI not valid: " + tikaLink.getUri()); + } + } + } } - extractedTasks.add(new LinkTask((Link)task, uri, depth, tikaLink.getText())); - } catch (URISyntaxException e) { - if(LOG.isWarnEnabled()) { - LOG.warn("URI not valid: "+ tikaLink.getUri()); - } - } - } - } - return new TikaParseImpl(dataBuffer.toString(), extractedTasks, bodyBuffer.toString(), mainContentBuffer.toString(), metadata); - } catch (SAXException ex) { - throw new DroidsException("Failure parsing document " + task.getId(), ex); - } catch (TikaException ex) { - throw new DroidsException("Failure parsing document " + task.getId(), ex); - } finally { - instream.close(); - } - } + return new TikaParseImpl(dataBuffer.toString(), extractedTasks, bodyBuffer.toString(), mainContentBuffer.toString(), metadata); + } catch (SAXException ex) { + throw new DroidsException("Failure parsing document " + task.getId(), ex); + } catch (TikaException ex) { + throw new DroidsException("Failure parsing document " + task.getId(), ex); + } finally { + instream.close(); + } + } } Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java?rev=1423339&r1=1423338&r2=1423339&view=diff ============================================================================== --- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java (original) +++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java Tue Dec 18 08:47:39 2012 @@ -50,76 +50,74 @@ import org.apache.tika.sax.TeeContentHan import org.xml.sax.SAXException; /** - * * @deprecated Use TikaDocumentParser instead as it handles HTML just fine and performs the same operations. - * */ @Deprecated public class TikaHtmlParser implements TikaParser { - protected static final Log log = LogFactory.getLog(TikaHtmlParser.class); + protected static final Log log = LogFactory.getLog(TikaHtmlParser.class); - @Override - public TikaParse parse(ContentEntity entity, Task task) throws IOException, DroidsException { - // Init Tika objects - org.apache.tika.parser.Parser parser = new AutoDetectParser(); - Metadata metadata = new Metadata(); - - String charset = entity.getCharset(); - if (charset == null) { - charset = "UTF-8"; - } - - StringWriter dataBuffer = new StringWriter(); - StringWriter bodyBuffer = new StringWriter(); - StringWriter mainContentBuffer = new StringWriter(); - - SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); - TransformerHandler xmlHandler; - try { - xmlHandler = factory.newTransformerHandler(); - } catch (TransformerConfigurationException e) { - throw new DroidsException(e); + @Override + public TikaParse parse(ContentEntity entity, Task task) throws IOException, DroidsException { + // Init Tika objects + org.apache.tika.parser.Parser parser = new AutoDetectParser(); + Metadata metadata = new Metadata(); + + String charset = entity.getCharset(); + if (charset == null) { + charset = "UTF-8"; + } + + StringWriter dataBuffer = new StringWriter(); + StringWriter bodyBuffer = new StringWriter(); + StringWriter mainContentBuffer = new StringWriter(); + + SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); + TransformerHandler xmlHandler; + try { + xmlHandler = factory.newTransformerHandler(); + } catch (TransformerConfigurationException e) { + throw new DroidsException(e); + } + xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); + xmlHandler.setResult(new StreamResult(dataBuffer)); + + BoilerpipeContentHandler mainContentHandler = new BoilerpipeContentHandler(mainContentBuffer); + BodyContentHandler bodyHandler = new BodyContentHandler(bodyBuffer); + LinkContentHandler linkHandler = new LinkContentHandler(); + + TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, mainContentHandler, bodyHandler, linkHandler); + + InputStream instream = entity.obtainContent(); + try { + parser.parse(instream, parallelHandler, metadata, new ParseContext()); + + ArrayList<Link> extractedTasks = new ArrayList<Link>(); + if (task instanceof Link) { + int depth = task.getDepth() + 1; + for (org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) { + try { + URI uri = new URI(tikaLink.getUri()); + // Test to see if the scheme is empty + // This would indicate a relative URL, so resolve it against the task URI + if (uri.getScheme() == null) { + uri = ((Link) task).getURI().resolve(uri); + } + extractedTasks.add(new LinkTask((Link) task, uri, depth, tikaLink.getText())); + } catch (URISyntaxException e) { + if (log.isWarnEnabled()) { + log.warn("URI not valid: " + tikaLink.getUri()); + } + } + } + } + return new TikaParseImpl(dataBuffer.toString(), extractedTasks, bodyBuffer.toString(), mainContentBuffer.toString(), metadata); + } catch (SAXException ex) { + throw new DroidsException("Failure parsing document " + task.getId(), ex); + } catch (TikaException ex) { + throw new DroidsException("Failure parsing document " + task.getId(), ex); + } finally { + instream.close(); + } } - xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); - xmlHandler.setResult(new StreamResult(dataBuffer)); - - BoilerpipeContentHandler mainContentHandler = new BoilerpipeContentHandler(mainContentBuffer); - BodyContentHandler bodyHandler = new BodyContentHandler(bodyBuffer); - LinkContentHandler linkHandler = new LinkContentHandler(); - - TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, mainContentHandler, bodyHandler, linkHandler ); - - InputStream instream = entity.obtainContent(); - try { - parser.parse(instream, parallelHandler, metadata, new ParseContext()); - - ArrayList<Link> extractedTasks = new ArrayList<Link>(); - if (task instanceof Link) { - int depth = task.getDepth() + 1; - for(org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) { - try { - URI uri = new URI(tikaLink.getUri()); - // Test to see if the scheme is empty - // This would indicate a relative URL, so resolve it against the task URI - if(uri.getScheme() == null) { - uri = ((Link) task).getURI().resolve(uri); - } - extractedTasks.add(new LinkTask((Link)task, uri, depth, tikaLink.getText())); - } catch (URISyntaxException e) { - if(log.isWarnEnabled()) { - log.warn("URI not valid: "+ tikaLink.getUri()); - } - } - } - } - return new TikaParseImpl(dataBuffer.toString(), extractedTasks, bodyBuffer.toString(), mainContentBuffer.toString(), metadata); - } catch (SAXException ex) { - throw new DroidsException("Failure parsing document " + task.getId(), ex); - } catch (TikaException ex) { - throw new DroidsException("Failure parsing document " + task.getId(), ex); - } finally { - instream.close(); - } - } } Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParse.java URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParse.java?rev=1423339&r1=1423338&r2=1423339&view=diff ============================================================================== --- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParse.java (original) +++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParse.java Tue Dec 18 08:47:39 2012 @@ -21,36 +21,41 @@ import org.apache.tika.metadata.Metadata public interface TikaParse extends Parse { - /** - * Retrieves the main content of the parsed document. - * Uses Tika's plugin in for Boilerpipe. - * @return plain text result with boilerplate removed - */ - public String getMainContent(); - - /** - * Extracted meta data from the document. This can include - * meta tags from within an HTML document - * @return metadata object from the parse - */ - public Metadata getMetadata(); - - /** - * The HTML representation of the document. - * @return The HTML representation of the document. - */ - public String getXml(); - - /** - * Plain text representation of the document. - * @return plain text version without formatting - */ - public String getPlainText(); - - /** - * If the document should be indexed or not. - * This can be determined from metadata or other methods - * @return false if the document shouldn't be indexed, true otherwise - */ - public boolean isIndexed(); + /** + * Retrieves the main content of the parsed document. + * Uses Tika's plugin in for Boilerpipe. + * + * @return plain text result with boilerplate removed + */ + public String getMainContent(); + + /** + * Extracted meta data from the document. This can include + * meta tags from within an HTML document + * + * @return metadata object from the parse + */ + public Metadata getMetadata(); + + /** + * The HTML representation of the document. + * + * @return The HTML representation of the document. + */ + public String getXml(); + + /** + * Plain text representation of the document. + * + * @return plain text version without formatting + */ + public String getPlainText(); + + /** + * If the document should be indexed or not. + * This can be determined from metadata or other methods + * + * @return false if the document shouldn't be indexed, true otherwise + */ + public boolean isIndexed(); } Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/parse/TikaParseImpl.java URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/parse/TikaParseImpl.java?rev=1423339&r1=1423338&r2=1423339&view=diff ============================================================================== --- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/parse/TikaParseImpl.java (original) +++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/parse/TikaParseImpl.java Tue Dec 18 08:47:39 2012 @@ -26,60 +26,60 @@ import org.apache.tika.metadata.Metadata public class TikaParseImpl extends ParseImpl implements TikaParse { - private String plainText; - private String mainContent; - private Metadata metadata; - - public TikaParseImpl(String text, Collection<Link> outlinks) { - super(text,outlinks); - } - - public TikaParseImpl(String text, Object data, Collection<Link> outlinks) { - super(text,data,outlinks); - } - - public TikaParseImpl(String xmlContent, ArrayList<Link> extractedTasks, - String plainText, String mainContent, Metadata metadata) { - this(xmlContent, extractedTasks); - this.plainText = plainText; - this.mainContent = mainContent; - this.metadata = metadata; - } - - @Override - public String getMainContent() { - return mainContent; - } - - @Override - public Metadata getMetadata() { - return metadata; - } - - @Override - public String getXml() { - return super.text; - } - - @Override - public String getPlainText() { - return plainText; - } - - @Override - public boolean isFollowed() { - if(metadata.get("robots") != null && metadata.get("robots").toLowerCase().contains("nofollow")) { - return false; - } - return true; - } - - @Override - public boolean isIndexed() { - if(metadata.get("robots") != null && metadata.get("robots").toLowerCase().contains("noindex")) { - return false; + private String plainText; + private String mainContent; + private Metadata metadata; + + public TikaParseImpl(String text, Collection<Link> outlinks) { + super(text, outlinks); + } + + public TikaParseImpl(String text, Object data, Collection<Link> outlinks) { + super(text, data, outlinks); + } + + public TikaParseImpl(String xmlContent, ArrayList<Link> extractedTasks, + String plainText, String mainContent, Metadata metadata) { + this(xmlContent, extractedTasks); + this.plainText = plainText; + this.mainContent = mainContent; + this.metadata = metadata; + } + + @Override + public String getMainContent() { + return mainContent; + } + + @Override + public Metadata getMetadata() { + return metadata; + } + + @Override + public String getXml() { + return super.text; + } + + @Override + public String getPlainText() { + return plainText; + } + + @Override + public boolean isFollowed() { + if (metadata.get("robots") != null && metadata.get("robots").toLowerCase().contains("nofollow")) { + return false; + } + return true; + } + + @Override + public boolean isIndexed() { + if (metadata.get("robots") != null && metadata.get("robots").toLowerCase().contains("noindex")) { + return false; + } + return true; } - return true; - } } Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/site/site.xml URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/site/site.xml?rev=1423339&r1=1423338&r2=1423339&view=diff ============================================================================== --- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/site/site.xml (original) +++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/site/site.xml Tue Dec 18 08:47:39 2012 @@ -16,14 +16,14 @@ limitations under the License. --> <project xmlns="http://maven.apache.org/DECORATION/1.0.0" - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/DECORATION/1.0.0 http://maven.apache.org/xsd/decoration-1.0.0.xsd"> - <body> - <menu ref="parent" /> - - <menu name="JavaDocs"> - <item name="JavaDocs" href="apidocs/index.html"/> - <item name="Test JavaDocs" href="testapidocs/index.html"/> - </menu> - </body> + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/DECORATION/1.0.0 http://maven.apache.org/xsd/decoration-1.0.0.xsd"> + <body> + <menu ref="parent"/> + + <menu name="JavaDocs"> + <item name="JavaDocs" href="apidocs/index.html"/> + <item name="Test JavaDocs" href="testapidocs/index.html"/> + </menu> + </body> </project> \ No newline at end of file Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java?rev=1423339&r1=1423338&r2=1423339&view=diff ============================================================================== --- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java (original) +++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java Tue Dec 18 08:47:39 2012 @@ -19,10 +19,9 @@ package org.apache.droids.tika; import junit.framework.TestCase; public class TikaHtmlParserTest extends TestCase { - - public void testSomething() throws Exception - { - // TODO -- test stuff! - assertTrue( true ); - } + + public void testSomething() throws Exception { + // TODO -- test stuff! + assertTrue(true); + } } \ No newline at end of file
