Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,101 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.xml; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.io.InputStream; + +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; +import org.xml.sax.helpers.DefaultHandler; + +public class DcXMLParserTest extends TikaTest { + + @Test + public void testXMLParserAsciiChars() throws Exception { + try (InputStream input = DcXMLParserTest.class.getResourceAsStream( + "/test-documents/testXML.xml")) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + new DcXMLParser().parse(input, handler, metadata); + + assertEquals( + "application/xml", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("Tika test document", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Rida Benjelloun", metadata.get(TikaCoreProperties.CREATOR)); + + // The file contains 5 dc:subject tags, which come through as + // a multi-valued Tika Metadata entry in file order + assertEquals(true, metadata.isMultiValued(TikaCoreProperties.KEYWORDS)); + assertEquals(5, metadata.getValues(TikaCoreProperties.KEYWORDS).length); + assertEquals("Java", metadata.getValues(TikaCoreProperties.KEYWORDS)[0]); + assertEquals("XML", metadata.getValues(TikaCoreProperties.KEYWORDS)[1]); + assertEquals("XSLT", metadata.getValues(TikaCoreProperties.KEYWORDS)[2]); + assertEquals("JDOM", metadata.getValues(TikaCoreProperties.KEYWORDS)[3]); + assertEquals("Indexation", metadata.getValues(TikaCoreProperties.KEYWORDS)[4]); + assertEquals(true, metadata.isMultiValued(Metadata.SUBJECT)); + assertEquals(5, metadata.getValues(Metadata.SUBJECT).length); + assertEquals("Java", metadata.getValues(Metadata.SUBJECT)[0]); + assertEquals("XML", 
metadata.getValues(Metadata.SUBJECT)[1]); + assertEquals("XSLT", metadata.getValues(Metadata.SUBJECT)[2]); + assertEquals("JDOM", metadata.getValues(Metadata.SUBJECT)[3]); + assertEquals("Indexation", metadata.getValues(Metadata.SUBJECT)[4]); + + assertEquals( + "Framework d\'indexation des documents XML, HTML, PDF etc..", + metadata.get(TikaCoreProperties.DESCRIPTION)); + assertEquals( + "http://www.apache.org", + metadata.get(TikaCoreProperties.IDENTIFIER)); + assertEquals("test", metadata.get(TikaCoreProperties.TYPE)); + assertEquals("application/msword", metadata.get(TikaCoreProperties.FORMAT)); + assertEquals("Fr", metadata.get(TikaCoreProperties.LANGUAGE)); + assertTrue(metadata.get(TikaCoreProperties.RIGHTS).contains("testing chars")); + + String content = handler.toString(); + assertContains("Tika test document", content); + + assertEquals("2000-12-01T00:00:00.000Z", metadata.get(TikaCoreProperties.CREATED)); + } + } + + @Test + public void testXMLParserNonAsciiChars() throws Exception { + try (InputStream input = DcXMLParserTest.class.getResourceAsStream("/test-documents/testXML.xml")) { + Metadata metadata = new Metadata(); + new DcXMLParser().parse(input, new DefaultHandler(), metadata); + + final String expected = "Archim\u00E8de et Lius \u00E0 Ch\u00E2teauneuf testing chars en \u00E9t\u00E9"; + assertEquals(expected, metadata.get(TikaCoreProperties.RIGHTS)); + } + } + + // TIKA-1048 + @Test + public void testNoSpaces() throws Exception { + String text = getXML("testXML2.xml").xml; + assertFalse(text.contains("testSubject")); + } +}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,126 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.xml; + +import static org.junit.Assert.assertEquals; + +import java.io.InputStream; + +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.TeeContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +public class EmptyAndDuplicateElementsXMLParserTest extends TikaTest { + + private Property FIRST_NAME = Property.internalTextBag( + "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "FirstName"); + private Property LAST_NAME = Property.internalTextBag( + "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "LastName"); + + @Test + public void testDefaultBehavior() throws Exception { + try (InputStream input = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream( + "/test-documents/testXML3.xml")) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + new DefaultCustomXMLTestParser().parse(input, handler, metadata, new ParseContext()); + + assertEquals(4, metadata.getValues(FIRST_NAME).length); + assertEquals(2, metadata.getValues(LAST_NAME).length); + + assertEquals("John", metadata.getValues(FIRST_NAME)[0]); + assertEquals("Smith", metadata.getValues(LAST_NAME)[0]); + + assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]); + assertEquals("Doe", metadata.getValues(LAST_NAME)[1]); + + // We didn't know Bob's last name, but now we don't know an entry existed + assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]); + + // We don't know Kate's last name because it was a duplicate + assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]); + } + } + + @Test + public void testEmptiesAndRepeats() throws Exception { + try (InputStream input = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream( + "/test-documents/testXML3.xml")) { + Metadata metadata = new Metadata(); + 
ContentHandler handler = new BodyContentHandler(); + new AllowEmptiesAndDuplicatesCustomXMLTestParser().parse(input, handler, metadata, new ParseContext()); + + assertEquals(4, metadata.getValues(FIRST_NAME).length); + assertEquals(4, metadata.getValues(LAST_NAME).length); + + assertEquals("John", metadata.getValues(FIRST_NAME)[0]); + assertEquals("Smith", metadata.getValues(LAST_NAME)[0]); + + assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]); + assertEquals("Doe", metadata.getValues(LAST_NAME)[1]); + + assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]); + assertEquals("", metadata.getValues(LAST_NAME)[2]); + + assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]); + assertEquals("Smith", metadata.getValues(LAST_NAME)[3]); + } + } + + private class DefaultCustomXMLTestParser extends XMLParser { + + private static final long serialVersionUID = 2458579047014545931L; + + protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaProperty, String localPart) { + return new ElementMetadataHandler( + "http://custom", + localPart, + metadata, + tikaProperty); + } + + protected ContentHandler getContentHandler( + ContentHandler handler, Metadata metadata, ParseContext context) { + return new TeeContentHandler( + super.getContentHandler(handler, metadata, context), + getCustomElementHandler(metadata, FIRST_NAME, "FirstName"), + getCustomElementHandler(metadata, LAST_NAME, "LastName")); + } + } + + private class AllowEmptiesAndDuplicatesCustomXMLTestParser extends DefaultCustomXMLTestParser { + + private static final long serialVersionUID = 3735646809954466229L; + + protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaProperty, String localPart) { + return new ElementMetadataHandler( + "http://custom", + localPart, + metadata, + tikaProperty, + true, + true); + } + } + + +} Added: 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.xml; + +import static org.apache.tika.TikaTest.assertContains; +import static org.junit.Assert.assertEquals; + +import java.io.InputStream; + +import org.apache.tika.TikaTest.TrackingHandler; +import org.apache.tika.extractor.ContainerExtractor; +import org.apache.tika.extractor.ParserContainerExtractor; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +public class FictionBookParserTest { + + @Test + public void testFB2() throws Exception { + try (InputStream input = FictionBookParserTest.class.getResourceAsStream("/test-documents/test.fb2")) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + new FictionBookParser().parse(input, handler, metadata, new ParseContext()); + String content = handler.toString(); + + assertContains("1812", content); + } + } + + @Test + public void testEmbedded() throws Exception { + try (InputStream input = FictionBookParserTest.class.getResourceAsStream("/test-documents/test.fb2")) { + ContainerExtractor extractor = new ParserContainerExtractor(); + TikaInputStream stream = TikaInputStream.get(input); + + assertEquals(true, extractor.isSupported(stream)); + + // Process it + TrackingHandler handler = new TrackingHandler(); + extractor.extract(stream, null, handler); + + assertEquals(2, handler.filenames.size()); + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml (added) +++ tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml Wed Jan 6 03:50:50 2016 @@ -0,0 +1,102 @@ 
+<?xml version="1.0" encoding="UTF-8"?> +<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor + license agreements. See the NOTICE file distributed with this work for additional + information regarding copyright ownership. The ASF licenses this file to + you under the Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. You may obtain a copy of + the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required + by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied. See the License for the specific + language governing permissions and limitations under the License. --> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parser-modules</artifactId> + <version>2.0-SNAPSHOT</version> + </parent> + + <artifactId>tika-web-module</artifactId> + <name>Apache Tika Web Module</name> + <url>http://tika.apache.org/</url> + + <properties> + <mime4j.version>0.7.2</mime4j.version> + </properties> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + </dependency> + + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.ccil.cowan.tagsoup</groupId> + <artifactId>tagsoup</artifactId> + <version>1.2.1</version> + </dependency> + <dependency> + <groupId>de.l3s.boilerpipe</groupId> + <artifactId>boilerpipe</artifactId> + 
<version>1.1.0</version> + </dependency> + <dependency> + <groupId>rome</groupId> + <artifactId>rome</artifactId> + <version>1.0</version> + </dependency> + <dependency> + <groupId>org.apache.james</groupId> + <artifactId>apache-mime4j-core</artifactId> + <version>${mime4j.version}</version> + </dependency> + <dependency> + <groupId>org.apache.james</groupId> + <artifactId>apache-mime4j-dom</artifactId> + <version>${mime4j.version}</version> + </dependency> + <dependency> + <groupId>com.pff</groupId> + <artifactId>java-libpst</artifactId> + <version>0.8.1</version> + </dependency> + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + <version>${commons.io.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-text-module</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-package-module</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.feed; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +import com.sun.syndication.feed.synd.SyndContent; +import com.sun.syndication.feed.synd.SyndEntry; +import com.sun.syndication.feed.synd.SyndFeed; +import com.sun.syndication.io.FeedException; +import com.sun.syndication.io.SyndFeedInput; + +/** + * Feed parser. + * <p> + * Uses Rome for parsing the feeds. A feed description is put in a paragraph + * with its link and title in an anchor. 
+ */ +public class FeedParser extends AbstractParser { + + /** Serial version UID */ + private static final long serialVersionUID = -3785361933034525186L; + + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( + MediaType.application("rss+xml"), + MediaType.application("atom+xml")))); + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + public void parse( + InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + // set the encoding? + try { + SyndFeed feed = new SyndFeedInput().build( + new InputSource(new CloseShieldInputStream(stream))); + + String title = stripTags(feed.getTitleEx()); + String description = stripTags(feed.getDescriptionEx()); + + metadata.set(TikaCoreProperties.TITLE, title); + metadata.set(TikaCoreProperties.DESCRIPTION, description); + // store the other fields in the metadata + + XHTMLContentHandler xhtml = + new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + xhtml.element("h1", title); + xhtml.element("p", description); + + xhtml.startElement("ul"); + for (Object e : feed.getEntries()) { + SyndEntry entry = (SyndEntry) e; + String link = entry.getLink(); + if (link != null) { + xhtml.startElement("li"); + xhtml.startElement("a", "href", link); + xhtml.characters(stripTags(entry.getTitleEx())); + xhtml.endElement("a"); + SyndContent content = entry.getDescription(); + if (content != null) { + xhtml.newline(); + xhtml.characters(stripTags(content)); + } + xhtml.endElement("li"); + } + } + xhtml.endElement("ul"); + + xhtml.endDocument(); + } catch (FeedException e) { + throw new TikaException("RSS parse error", e); + } + + } + + private static String stripTags(SyndContent c) { + if (c == null) + return ""; + + String value = c.getValue(); + + String[] parts = value.split("<[^>]*>"); + StringBuffer buf = new 
StringBuffer(); + + for (String part : parts) + buf.append(part); + + return buf.toString().trim(); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,347 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.html; + +import java.io.Writer; +import java.util.ArrayList; +import java.util.BitSet; +import java.util.List; +import java.util.Locale; + +import de.l3s.boilerpipe.BoilerpipeExtractor; +import de.l3s.boilerpipe.BoilerpipeProcessingException; +import de.l3s.boilerpipe.document.TextBlock; +import de.l3s.boilerpipe.document.TextDocument; +import de.l3s.boilerpipe.extractors.ArticleExtractor; +import de.l3s.boilerpipe.extractors.DefaultExtractor; +import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.sax.WriteOutContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +/** + * Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a> + * library to automatically extract the main content from a web page. + * <p/> + * Use this as a {@link ContentHandler} object passed to + * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata, org.apache.tika.parser.ParseContext)} + */ +public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler { + + /** + * The newline character that gets inserted after block elements. + */ + private static final char[] NL = new char[]{'\n'}; + private ContentHandler delegate; + private BoilerpipeExtractor extractor; + private boolean includeMarkup; + private boolean inHeader; + private boolean inFooter; + private int headerCharOffset; + private List<RecordedElement> elements; + private TextDocument td; + /** + * Creates a new boilerpipe-based content extractor, using the + * {@link DefaultExtractor} extraction rules and "delegate" as the content handler. 
+ * + * @param delegate The {@link ContentHandler} object + */ + public BoilerpipeContentHandler(ContentHandler delegate) { + this(delegate, DefaultExtractor.INSTANCE); + } + + /** + * Creates a content handler that writes XHTML body character events to + * the given writer. + * + * @param writer writer + */ + public BoilerpipeContentHandler(Writer writer) { + this(new WriteOutContentHandler(writer)); + } + + /** + * Creates a new boilerpipe-based content extractor, using the given + * extraction rules. The extracted main content will be passed to the + * <delegate> content handler. + * + * @param delegate The {@link ContentHandler} object + * @param extractor Extraction rules to use, e.g. {@link ArticleExtractor} + */ + public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) { + this.td = null; + this.delegate = delegate; + this.extractor = extractor; + } + + public boolean isIncludeMarkup() { + return includeMarkup; + } + + public void setIncludeMarkup(boolean includeMarkup) { + this.includeMarkup = includeMarkup; + } + + /** + * Retrieves the built TextDocument + * + * @return TextDocument + */ + public TextDocument getTextDocument() { + return td; + } + + @Override + public void startDocument() throws SAXException { + super.startDocument(); + + delegate.startDocument(); + + inHeader = true; + inFooter = false; + headerCharOffset = 0; + + if (includeMarkup) { + elements = new ArrayList<RecordedElement>(); + } + } + + @Override + public void startPrefixMapping(String prefix, String uri) throws SAXException { + super.startPrefixMapping(prefix, uri); + delegate.startPrefixMapping(prefix, uri); + } + + ; + + @Override + public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { + super.startElement(uri, localName, qName, atts); + + if (inHeader) { + delegate.startElement(uri, localName, qName, atts); + } else if (inFooter) { + // Do nothing + } else if (includeMarkup) { + 
elements.add(new RecordedElement(uri, localName, qName, atts)); + } else { + // This happens for the <body> element, if we're not doing markup. + delegate.startElement(uri, localName, qName, atts); + } + } + + ; + + @Override + public void characters(char[] chars, int offset, int length) throws SAXException { + super.characters(chars, offset, length); + + if (inHeader) { + delegate.characters(chars, offset, length); + headerCharOffset++; + } else if (inFooter) { + // Do nothing + } else if (includeMarkup) { + RecordedElement element = elements.get(elements.size() - 1); + + char[] characters = new char[length]; + System.arraycopy(chars, offset, characters, 0, length); + element.getCharacters().add(characters); + } + } + + ; + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + super.endElement(uri, localName, qName); + + if (inHeader) { + delegate.endElement(uri, localName, qName); + inHeader = !localName.equals("head"); + } else if (inFooter) { + // Do nothing + } else if (localName.equals("body")) { + inFooter = true; + } else if (includeMarkup) { + // Add the end element, and the continuation from the previous element + elements.add(new RecordedElement(uri, localName, qName)); + elements.add(new RecordedElement()); + } + } + + ; + + @Override + public void endDocument() throws SAXException { + super.endDocument(); + + td = toTextDocument(); + try { + extractor.process(td); + } catch (BoilerpipeProcessingException e) { + throw new SAXException(e); + } + + Attributes emptyAttrs = new AttributesImpl(); + + // At this point we have all the information we need to either emit N paragraphs + // of plain text (if not including markup), or we have to replay our recorded elements + // and only emit character runs that passed the boilerpipe filters. 
+ if (includeMarkup) { + BitSet validCharacterRuns = new BitSet(); + for (TextBlock block : td.getTextBlocks()) { + if (block.isContent()) { + BitSet bs = block.getContainedTextElements(); + if (bs != null) { + validCharacterRuns.or(bs); + } + } + } + + // Now have bits set for all valid character runs. Replay our recorded elements, + // but only emit character runs flagged as valid. + int curCharsIndex = headerCharOffset; + + for (RecordedElement element : elements) { + switch (element.getElementType()) { + case START: + delegate.startElement(element.getUri(), element.getLocalName(), element.getQName(), element.getAttrs()); + // Fall through + + case CONTINUE: + // Now emit characters that are valid. Note that boilerpipe pre-increments the character index, so + // we have to follow suit. + for (char[] chars : element.getCharacters()) { + curCharsIndex++; + + if (validCharacterRuns.get(curCharsIndex)) { + delegate.characters(chars, 0, chars.length); + + // https://issues.apache.org/jira/browse/TIKA-961 + if (!Character.isWhitespace(chars[chars.length - 1])) { + // Only add whitespace for certain elements + if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) { + delegate.ignorableWhitespace(NL, 0, NL.length); + } + } + } + } + break; + + case END: + delegate.endElement(element.getUri(), element.getLocalName(), element.getQName()); + break; + + default: + throw new RuntimeException("Unhandled element type: " + element.getElementType()); + } + + + } + } else { + for (TextBlock block : td.getTextBlocks()) { + if (block.isContent()) { + delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs); + char[] chars = block.getText().toCharArray(); + delegate.characters(chars, 0, chars.length); + delegate.endElement(XHTMLContentHandler.XHTML, "p", "p"); + delegate.ignorableWhitespace(NL, 0, NL.length); + } + } + } + + delegate.endElement(XHTMLContentHandler.XHTML, "body", "body"); + delegate.endElement(XHTMLContentHandler.XHTML, "html", "html"); + + 
// We defer ending any prefix mapping until here, which is why we don't pass this + // through to the delegate in an overridden method. + delegate.endPrefixMapping(""); + + delegate.endDocument(); + } + + ; + + private static class RecordedElement { + private String uri; + private String localName; + private String qName; + private Attributes attrs; + private List<char[]> characters; + private ElementType elementType; + public RecordedElement(String uri, String localName, String qName, Attributes attrs) { + this(uri, localName, qName, attrs, ElementType.START); + } + + public RecordedElement(String uri, String localName, String qName) { + this(uri, localName, qName, null, ElementType.END); + } + + public RecordedElement() { + this(null, null, null, null, ElementType.CONTINUE); + } + + protected RecordedElement(String uri, String localName, String qName, Attributes attrs, RecordedElement.ElementType elementType) { + this.uri = uri; + this.localName = localName; + this.qName = qName; + this.attrs = attrs; + this.elementType = elementType; + this.characters = new ArrayList<char[]>(); + } + + @Override + public String toString() { + return String.format(Locale.ROOT, "<%s> of type %s", localName, elementType); + } + + public String getUri() { + return uri; + } + + public String getLocalName() { + return localName; + } + + public String getQName() { + return qName; + } + + public Attributes getAttrs() { + return attrs; + } + + public List<char[]> getCharacters() { + return characters; + } + + public RecordedElement.ElementType getElementType() { + return elementType; + } + + public enum ElementType { + START, + END, + CONTINUE + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java?rev=1723223&view=auto 
============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.html; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +/** + * The default HTML mapping rules in Tika. 
+ * + * @since Apache Tika 0.6 + */ +@SuppressWarnings("serial") +public class DefaultHtmlMapper implements HtmlMapper { + + /** + * @since Apache Tika 0.8 + */ + public static final HtmlMapper INSTANCE = new DefaultHtmlMapper(); + // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd + private static final Map<String, String> SAFE_ELEMENTS = new HashMap<String, String>() {{ + put("H1", "h1"); + put("H2", "h2"); + put("H3", "h3"); + put("H4", "h4"); + put("H5", "h5"); + put("H6", "h6"); + + put("P", "p"); + put("PRE", "pre"); + put("BLOCKQUOTE", "blockquote"); + put("Q", "q"); + + put("UL", "ul"); + put("OL", "ol"); + put("MENU", "ul"); + put("LI", "li"); + put("DL", "dl"); + put("DT", "dt"); + put("DD", "dd"); + + put("TABLE", "table"); + put("THEAD", "thead"); + put("TBODY", "tbody"); + put("TR", "tr"); + put("TH", "th"); + put("TD", "td"); + + put("ADDRESS", "address"); + + // TIKA-460 - add anchors + put("A", "a"); + + // TIKA-463 - add additional elements that contain URLs (and their sub-elements) + put("MAP", "map"); + put("AREA", "area"); + put("IMG", "img"); + put("FRAMESET", "frameset"); + put("FRAME", "frame"); + put("IFRAME", "iframe"); + put("OBJECT", "object"); + put("PARAM", "param"); + put("INS", "ins"); + put("DEL", "del"); + }}; + private static final Set<String> DISCARDABLE_ELEMENTS = new HashSet<String>() {{ + add("STYLE"); + add("SCRIPT"); + }}; + // For information on tags & attributes, see: + // http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#a_dtd_XHTML-1.0-Strict + // http://www.w3schools.com/TAGS/ + private static final Map<String, Set<String>> SAFE_ATTRIBUTES = new HashMap<String, Set<String>>() {{ + put("a", attrSet("charset", "type", "name", "href", "hreflang", "rel", "rev", "shape", "coords")); + put("img", attrSet("src", "alt", "longdesc", "height", "width", "usemap", "ismap")); + put("frame", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling")); + put("iframe", 
attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width")); + put("link", attrSet("charset", "href", "hreflang", "type", "rel", "rev", "media")); + put("map", attrSet("id", "class", "style", "title", "name")); + put("area", attrSet("shape", "coords", "href", "nohref", "alt")); + put("object", attrSet("declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", + "width", "usemap", "name", "tabindex", "align", "border", "hspace", "vspace")); + put("param", attrSet("id", "name", "value", "valuetype", "type")); + put("blockquote", attrSet("cite")); + put("ins", attrSet("cite", "datetime")); + put("del", attrSet("cite", "datetime")); + put("q", attrSet("cite")); + + // TODO - fill out this set. Include core, i18n, etc sets where appropriate. + }}; + + private static Set<String> attrSet(String... attrs) { + Set<String> result = new HashSet<String>(); + for (String attr : attrs) { + result.add(attr); + } + return result; + } + + public String mapSafeElement(String name) { + return SAFE_ELEMENTS.get(name); + } + + /** + * Normalizes an attribute name. 
Assumes that the element name + * is valid and normalized + */ + public String mapSafeAttribute(String elementName, String attributeName) { + Set<String> safeAttrs = SAFE_ATTRIBUTES.get(elementName); + if ((safeAttrs != null) && safeAttrs.contains(attributeName)) { + return attributeName; + } else { + return null; + } + } + + public boolean isDiscardElement(String name) { + return DISCARDABLE_ELEMENTS.contains(name); + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.html; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.utils.CharsetUtils; + +/** + * Character encoding detector for determining the character encoding of a + * HTML document based on the potential charset parameter found in a + * Content-Type http-equiv meta tag somewhere near the beginning. Especially + * useful for determining the type among multiple closely related encodings + * (ISO-8859-*) for which other types of encoding detection are unreliable. + * + * @since Apache Tika 1.2 + */ +public class HtmlEncodingDetector implements EncodingDetector { + + // TIKA-357 - use bigger buffer for meta tag sniffing (was 4K) + private static final int META_TAG_BUFFER_SIZE = 8192; + + + private static final Pattern HTTP_META_PATTERN = Pattern.compile( + "(?is)<\\s*meta\\s+([^<>]+)" + ); + + //this should match both the older: + //<meta http-equiv="content-type" content="text/html; charset=xyz"/> + //and + //html5 <meta charset="xyz"> + //See http://webdesign.about.com/od/metatags/qt/meta-charset.htm + //for the noisiness that one might encounter in charset attrs. 
+ //Chose to go with strict ([-_:\\.a-z0-9]+) to match encodings + //following http://docs.oracle.com/javase/7/docs/api/java/nio/charset/Charset.html + //For a more general "not" matcher, try: + //("(?is)charset\\s*=\\s*['\\\"]?\\s*([^<>\\s'\\\";]+)") + private static final Pattern FLEXIBLE_CHARSET_ATTR_PATTERN = Pattern.compile( + ("(?is)charset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)") + ); + + private static final Charset ASCII = Charset.forName("US-ASCII"); + + public Charset detect(InputStream input, Metadata metadata) + throws IOException { + if (input == null) { + return null; + } + + // Read enough of the text stream to capture possible meta tags + input.mark(META_TAG_BUFFER_SIZE); + byte[] buffer = new byte[META_TAG_BUFFER_SIZE]; + int n = 0; + int m = input.read(buffer); + while (m != -1 && n < buffer.length) { + n += m; + m = input.read(buffer, n, buffer.length - n); + } + input.reset(); + + // Interpret the head as ASCII and try to spot a meta tag with + // a possible character encoding hint + + String head = ASCII.decode(ByteBuffer.wrap(buffer, 0, n)).toString(); + + Matcher equiv = HTTP_META_PATTERN.matcher(head); + Matcher charsetMatcher = FLEXIBLE_CHARSET_ATTR_PATTERN.matcher(""); + //iterate through meta tags + while (equiv.find()) { + String attrs = equiv.group(1); + charsetMatcher.reset(attrs); + //iterate through charset= and return the first match + //that is valid + while (charsetMatcher.find()) { + String candCharset = charsetMatcher.group(1); + if (CharsetUtils.isSupported(candCharset)) { + try { + return CharsetUtils.forName(candCharset); + } catch (Exception e) { + //ignore + } + } + } + } + return null; + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=1723223&view=auto 
/**
 * SAX handler that receives the (upper-case named) HTML events and forwards
 * a filtered, remapped subset of them to an {@link XHTMLContentHandler},
 * while copying selected head information (title, meta tags, base URL) into
 * the Tika {@link Metadata}.
 * <p>
 * Three nesting counters drive the filtering: {@code bodyLevel} (inside
 * BODY or FRAMESET), {@code discardLevel} (inside an element the mapper
 * wants discarded) and {@code titleLevel} (inside TITLE).
 */
class HtmlHandler extends TextContentHandler {

    // List of attributes that need to be resolved.
    private static final Set<String> URI_ATTRIBUTES =
            new HashSet<String>(Arrays.asList("src", "href", "longdesc", "cite"));
    // Two decimal numbers separated by commas and/or whitespace, as found
    // in the value of an "ICBM" meta tag.
    private static final Pattern ICBM =
            Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");
    private final HtmlMapper mapper;
    private final XHTMLContentHandler xhtml;
    private final Metadata metadata;
    // Text collected while inside TITLE and outside the body.
    private final StringBuilder title = new StringBuilder();
    // Nesting depth counters; each is incremented for every start tag once
    // the corresponding region has been entered and decremented on end tags.
    private int bodyLevel = 0;
    private int discardLevel = 0;
    private int titleLevel = 0;
    // Guards the title so that only the first completed TITLE is recorded.
    private boolean isTitleSetToMetadata = false;

    /**
     * Creates a handler writing to an already wrapped XHTML handler. If the
     * metadata has no content location yet and the resource name parses as
     * a URL, the resource name is used as the content location.
     */
    private HtmlHandler(
            HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata) {
        super(xhtml);
        this.mapper = mapper;
        this.xhtml = xhtml;
        this.metadata = metadata;

        // Try to determine the default base URL, if one has not been given
        if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
            if (name != null) {
                name = name.trim();
                try {
                    new URL(name); // test URL format
                    metadata.set(Metadata.CONTENT_LOCATION, name);
                } catch (MalformedURLException e) {
                    // The resource name is not a valid URL, ignore it
                }
            }
        }
    }

    /**
     * Creates a handler that wraps the given content handler in a new
     * {@link XHTMLContentHandler}.
     *
     * @param mapper   mapping rules for elements and attributes
     * @param handler  receiver of the produced XHTML events
     * @param metadata metadata object to read from and write to
     */
    public HtmlHandler(
            HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
        this(mapper, new XHTMLContentHandler(handler, metadata), metadata);
    }

    /**
     * Updates the nesting counters, processes META, BASE and LINK elements
     * seen outside the body, and forwards mapped elements seen inside the
     * body.
     */
    @Override
    public void startElement(
            String uri, String local, String name, Attributes atts)
            throws SAXException {
        if ("TITLE".equals(name) || titleLevel > 0) {
            titleLevel++;
        }
        if ("BODY".equals(name) || ("FRAMESET".equals(name)) || bodyLevel > 0) {
            bodyLevel++;
        }
        if (mapper.isDiscardElement(name) || discardLevel > 0) {
            discardLevel++;
        }

        if (bodyLevel == 0 && discardLevel == 0) {
            if ("META".equals(name) && atts.getValue("content") != null) {
                // TIKA-478: For cases where we have either a name or
                // "http-equiv", assume that XHTMLContentHandler will emit
                // these in the <head>, thus passing them through safely.
                if (atts.getValue("http-equiv") != null) {
                    addHtmlMetadata(
                            atts.getValue("http-equiv"),
                            atts.getValue("content"));
                } else if (atts.getValue("name") != null) {
                    // Record the meta tag in the metadata
                    addHtmlMetadata(
                            atts.getValue("name"),
                            atts.getValue("content"));
                } else if (atts.getValue("property") != null) {
                    // TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
                    metadata.add(
                            atts.getValue("property"),
                            atts.getValue("content"));
                }
            } else if ("BASE".equals(name) && atts.getValue("href") != null) {
                startElementWithSafeAttributes("base", atts);
                xhtml.endElement("base");
                // The href is resolved against the current content location
                // and then becomes the new content location.
                metadata.set(
                        Metadata.CONTENT_LOCATION,
                        resolve(atts.getValue("href")));
            } else if ("LINK".equals(name)) {
                startElementWithSafeAttributes("link", atts);
                xhtml.endElement("link");
            }
        }

        if (bodyLevel > 0 && discardLevel == 0) {
            String safe = mapper.mapSafeElement(name);
            if (safe != null) {
                startElementWithSafeAttributes(safe, atts);
            }
        }

        // The title buffer is cleared on every start tag, so only text that
        // follows the most recent start tag is kept.
        title.setLength(0);
    }

    /**
     * Adds a metadata setting from the HTML <head/> to the Tika metadata
     * object. The name and value are normalized where possible.
     * <p>
     * "ICBM" values matching the coordinate pattern are also stored as
     * latitude and longitude; a Content-Type value is stored as the content
     * type hint rather than as the content type itself; every other pair is
     * added as-is. A <code>null</code> name or value is ignored.
     */
    private void addHtmlMetadata(String name, String value) {
        if (name == null || value == null) {
            // ignore
        } else if (name.equalsIgnoreCase("ICBM")) {
            Matcher m = ICBM.matcher(value);
            if (m.matches()) {
                metadata.set("ICBM", m.group(1) + ", " + m.group(2));
                metadata.set(Metadata.LATITUDE, m.group(1));
                metadata.set(Metadata.LONGITUDE, m.group(2));
            } else {
                metadata.set("ICBM", value);
            }
        } else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
            //don't overwrite Metadata.CONTENT_TYPE!
            MediaType type = MediaType.parse(value);
            if (type != null) {
                metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, type.toString());
            } else {
                metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, value);
            }
        } else {
            metadata.add(name, value);
        }
    }

    /**
     * Starts the named XHTML element with only those attributes the mapper
     * accepts, resolving URL-valued attributes against the base URL.
     *
     * @param name XHTML element name (already mapped)
     * @param atts attributes as received from the parser
     */
    private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
        // NOTE(review): this shortcut also bypasses the default alt=""
        // added for img elements further down - confirm that is intended.
        if (atts.getLength() == 0) {
            xhtml.startElement(name);
            return;
        }

        boolean isObject = name.equals("object");
        String codebase = null;
        if (isObject) {
            // For <object>, data and classid are resolved against the
            // codebase attribute, or the content location when it is absent.
            codebase = atts.getValue("", "codebase");
            if (codebase != null) {
                codebase = resolve(codebase);
            } else {
                codebase = metadata.get(Metadata.CONTENT_LOCATION);
            }
        }

        AttributesImpl newAttributes = new AttributesImpl(atts);
        for (int att = 0; att < newAttributes.getLength(); att++) {
            String attrName = newAttributes.getLocalName(att);
            String normAttrName = mapper.mapSafeAttribute(name, attrName);
            if (normAttrName == null) {
                // Removal shifts the remaining attributes down, so step the
                // index back to revisit the slot that was just refilled.
                newAttributes.removeAttribute(att);
                att--;
            } else {
                // We have a remapped attribute name, so set it as it might have changed.
                newAttributes.setLocalName(att, normAttrName);

                // And resolve relative links. Eventually this should be pushed
                // into the HtmlMapper code.
                if (URI_ATTRIBUTES.contains(normAttrName)) {
                    newAttributes.setValue(att, resolve(newAttributes.getValue(att)));
                } else if (isObject && "codebase".equals(normAttrName)) {
                    newAttributes.setValue(att, codebase);
                } else if (isObject
                        && ("data".equals(normAttrName)
                        || "classid".equals(normAttrName))) {
                    newAttributes.setValue(
                            att,
                            resolve(codebase, newAttributes.getValue(att)));
                }
            }
        }

        // Make sure every img that reaches this point carries an alt attribute.
        if ("img".equals(name) && newAttributes.getValue("", "alt") == null) {
            newAttributes.addAttribute("", "alt", "alt", "CDATA", "");
        }

        xhtml.startElement(name, newAttributes);
    }

    /**
     * Closes mapped elements inside the body, records the title when the
     * outermost TITLE ends, and unwinds the nesting counters.
     */
    @Override
    public void endElement(
            String uri, String local, String name) throws SAXException {
        if (bodyLevel > 0 && discardLevel == 0) {
            String safe = mapper.mapSafeElement(name);
            if (safe != null) {
                xhtml.endElement(safe);
            } else if (XHTMLContentHandler.ENDLINE.contains(
                    name.toLowerCase(Locale.ENGLISH))) {
                // TIKA-343: Replace closing block tags (and <br/>) with a
                // newline unless the HtmlMapper above has already mapped
                // them to something else
                xhtml.newline();
            }
        }

        if (titleLevel > 0) {
            titleLevel--;
            if (titleLevel == 0 && !isTitleSetToMetadata) {
                metadata.set(TikaCoreProperties.TITLE, title.toString().trim());
                isTitleSetToMetadata = true;
            }
        }
        if (bodyLevel > 0) {
            bodyLevel--;
        }
        if (discardLevel > 0) {
            discardLevel--;
        }
    }

    /**
     * Appends text to the title buffer while inside TITLE and outside the
     * body, and forwards text that is inside the body and not discarded.
     */
    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        if (titleLevel > 0 && bodyLevel == 0) {
            title.append(ch, start, length);
        }
        if (bodyLevel > 0 && discardLevel == 0) {
            super.characters(ch, start, length);
        }
    }

    /**
     * Forwards ignorable whitespace only when inside the body and not
     * inside a discarded element.
     */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
        if (bodyLevel > 0 && discardLevel == 0) {
            super.ignorableWhitespace(ch, start, length);
        }
    }

    /**
     * Resolves the given URL against the current content location.
     */
    private String resolve(String url) {
        return resolve(metadata.get(Metadata.CONTENT_LOCATION), url);
    }

    /**
     * Resolves a (possibly relative) URL against a base URL.
     *
     * @param base base URL, may be <code>null</code>
     * @param url  URL to resolve; it is trimmed first
     * @return the resolved URL in external form, or the trimmed input when
     *         no base is available, the URL uses one of the listed
     *         non-hierarchical prefixes, or resolution fails
     */
    private String resolve(String base, String url) {
        url = url.trim();

        // Return the URL as-is if no base URL is available or if the URL
        // matches a common non-hierarchical or pseudo URI prefix
        String lower = url.toLowerCase(Locale.ENGLISH);
        if (base == null
                || lower.startsWith("urn:")
                || lower.startsWith("mailto:")
                || lower.startsWith("tel:")
                || lower.startsWith("data:")
                || lower.startsWith("javascript:")
                || lower.startsWith("about:")) {
            return url;
        }

        try {
            URL baseURL = new URL(base.trim());

            // We need to handle one special case, where the relativeUrl is
            // just a query string (like "?pid=1"), and the baseUrl doesn't
            // end with a '/'. In that case, the URL class removes the last
            // portion of the path, which we don't want.
            String path = baseURL.getPath();
            if (url.startsWith("?") && path.length() > 0 && !path.endsWith("/")) {
                return new URL(
                        baseURL.getProtocol(),
                        baseURL.getHost(), baseURL.getPort(),
                        baseURL.getPath() + url).toExternalForm();
            } else {
                return new URL(baseURL, url).toExternalForm();
            }
        } catch (MalformedURLException e) {
            // Unknown or broken format; just return the URL as received.
            return url;
        }
    }

}
/**
 * HTML mapper used to make incoming HTML documents easier to handle by
 * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
 * the parse context and uses it to map parsed HTML to "safe" XHTML. A client
 * that wants to customize this mapping can place a custom HtmlMapper instance
 * into the parse context.
 *
 * @since Apache Tika 0.6
 */
public interface HtmlMapper {

    /**
     * Maps "safe" HTML element names to semantic XHTML equivalents. If the
     * given element is unknown or deemed unsafe for inclusion in the parse
     * output, then this method returns <code>null</code> and the element
     * will be ignored but the content inside it is still processed. See
     * the {@link #isDiscardElement(String)} method for a way to discard
     * the entire contents of an element.
     *
     * @param name HTML element name (upper case)
     * @return XHTML element name (lower case), or
     *         <code>null</code> if the element is unsafe
     */
    String mapSafeElement(String name);

    /**
     * Checks whether all content within the given HTML element should be
     * discarded instead of including it in the parse output.
     *
     * @param name HTML element name (upper case)
     * @return <code>true</code> if content inside the named element
     *         should be ignored, <code>false</code> otherwise
     */
    boolean isDiscardElement(String name);


    /**
     * Maps "safe" HTML attribute names to semantic XHTML equivalents. If the
     * given attribute is unknown or deemed unsafe for inclusion in the parse
     * output, then this method returns <code>null</code> and the attribute
     * will be ignored. This method assumes that the element name
     * is valid and normalised.
     *
     * @param elementName   HTML element name (lower case)
     * @param attributeName HTML attribute name (lower case)
     * @return XHTML attribute name (lower case), or
     *         <code>null</code> if the attribute is unsafe
     */
    String mapSafeAttribute(String elementName, String attributeName);

}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.html; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.config.ServiceLoader; +import org.apache.tika.detect.AutoDetectReader; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.ccil.cowan.tagsoup.HTMLSchema; +import org.ccil.cowan.tagsoup.Schema; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * HTML parser. Uses TagSoup to turn the input document to HTML SAX events, + * and post-processes the events to produce XHTML and metadata expected by + * Tika clients. 
/**
 * HTML parser. Uses TagSoup to turn the input document to HTML SAX events,
 * and post-processes the events to produce XHTML and metadata expected by
 * Tika clients.
 */
public class HtmlParser extends AbstractParser {

    /**
     * Serial version UID
     */
    private static final long serialVersionUID = 7895315240498733128L;

    private static final MediaType XHTML = MediaType.application("xhtml+xml");
    private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml");
    private static final MediaType X_ASP = MediaType.application("x-asp");

    // Media types reported by getSupportedTypes.
    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
                    MediaType.text("html"),
                    XHTML,
                    WAP_XHTML,
                    X_ASP)));

    // Fallback service loader used when the parse context does not supply one.
    private static final ServiceLoader LOADER =
            new ServiceLoader(HtmlParser.class.getClassLoader());

    /**
     * HTML schema singleton used to amortise the heavy instantiation time.
     */
    private static final Schema HTML_SCHEMA = new HTMLSchema();


    /**
     * Returns the fixed set of media types handled by this parser.
     *
     * @param context parse context (not consulted)
     * @return unmodifiable set of supported media types
     */
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Parses the given HTML stream with TagSoup and forwards the resulting
     * events, through {@link XHTMLDowngradeHandler} and {@link HtmlHandler},
     * to the given content handler.
     * <p>
     * The character encoding is detected first. The content type metadata is
     * rewritten to include the detected charset when the existing value is
     * absent or starts with one of the supported type names; the content
     * encoding metadata is always set.
     *
     * @param stream   document stream; it is wrapped in a
     *                 {@link CloseShieldInputStream} before being read
     * @param handler  receiver of the XHTML SAX events
     * @param metadata document metadata, read and updated
     * @param context  parse context, consulted for a {@link ServiceLoader},
     *                 an {@link HtmlMapper} and a TagSoup {@link Schema}
     * @throws IOException   if the stream cannot be read
     * @throws SAXException  if event processing fails
     * @throws TikaException if the document cannot be processed
     */
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // Automatically detect the character encoding
        try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream),
                metadata,context.get(ServiceLoader.class, LOADER))) {
            Charset charset = reader.getCharset();
            String previous = metadata.get(Metadata.CONTENT_TYPE);
            MediaType contentType = null;
            // Keep the previously declared family of type, but attach the
            // detected charset to it; unknown types are left untouched.
            if (previous == null || previous.startsWith("text/html")) {
                contentType = new MediaType(MediaType.TEXT_HTML, charset);
            } else if (previous.startsWith("application/xhtml+xml")) {
                contentType = new MediaType(XHTML, charset);
            } else if (previous.startsWith("application/vnd.wap.xhtml+xml")) {
                contentType = new MediaType(WAP_XHTML, charset);
            } else if (previous.startsWith("application/x-asp")) {
                contentType = new MediaType(X_ASP, charset);
            }
            if (contentType != null) {
                metadata.set(Metadata.CONTENT_TYPE, contentType.toString());
            }
            // deprecated, see TIKA-431
            metadata.set(Metadata.CONTENT_ENCODING, charset.name());

            // Get the HTML mapper from the parse context
            HtmlMapper mapper =
                    context.get(HtmlMapper.class, new HtmlParserMapper());

            // Parse the HTML document
            org.ccil.cowan.tagsoup.Parser parser =
                    new org.ccil.cowan.tagsoup.Parser();

            // Use schema from context or default
            Schema schema = context.get(Schema.class, HTML_SCHEMA);

            // TIKA-528: Reuse shared schema to avoid heavy instantiation
            parser.setProperty(
                    org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
            // TIKA-599: Shared schema is thread-safe only if bogons are ignored
            parser.setFeature(
                    org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

            parser.setContentHandler(new XHTMLDowngradeHandler(
                    new HtmlHandler(mapper, handler, metadata)));

            parser.parse(reader.asInputSource());
        }
    }

    /**
     * Maps "safe" HTML element names to semantic XHTML equivalents. If the
     * given element is unknown or deemed unsafe for inclusion in the parse
     * output, then this method returns <code>null</code> and the element
     * will be ignored but the content inside it is still processed. See
     * the {@link #isDiscardElement(String)} method for a way to discard
     * the entire contents of an element.
     * <p/>
     * Subclasses can override this method to customize the default mapping.
     *
     * @param name HTML element name (upper case)
     * @return XHTML element name (lower case), or
     *         <code>null</code> if the element is unsafe
     * @since Apache Tika 0.5
     * @deprecated Use the {@link HtmlMapper} mechanism to customize
     *             the HTML mapping. This method will be removed in Tika 1.0.
     */
    protected String mapSafeElement(String name) {
        return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
    }

    /**
     * Checks whether all content within the given HTML element should be
     * discarded instead of including it in the parse output. Subclasses
     * can override this method to customize the set of discarded elements.
     *
     * @param name HTML element name (upper case)
     * @return <code>true</code> if content inside the named element
     *         should be ignored, <code>false</code> otherwise
     * @since Apache Tika 0.5
     * @deprecated Use the {@link HtmlMapper} mechanism to customize
     *             the HTML mapping. This method will be removed in Tika 1.0.
     */
    protected boolean isDiscardElement(String name) {
        return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
    }

    /**
     * Delegates to {@link DefaultHtmlMapper#INSTANCE} to decide whether the
     * attribute is passed through for the element.
     *
     * @param elementName   element name
     * @param attributeName attribute name
     * @return the mapped attribute name, or <code>null</code> if the default
     *         mapper rejects it
     * @deprecated Use the {@link HtmlMapper} mechanism to customize
     *             the HTML mapping. This method will be removed in Tika 1.0.
     */
    public String mapSafeAttribute(String elementName, String attributeName) {
        return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName);
    }

    /**
     * Adapter class that maintains backwards compatibility with the
     * protected HtmlParser methods. Making HtmlParser implement HtmlMapper
     * directly would require those methods to be public, which would break
     * backwards compatibility with subclasses.
     * <p>
     * Each call is routed to the enclosing parser instance, so overrides in
     * an HtmlParser subclass take effect when no mapper is supplied through
     * the parse context.
     *
     * @deprecated Use the {@link HtmlMapper} mechanism to customize
     *             the HTML mapping. This class will be removed in Tika 1.0.
     */
    private class HtmlParserMapper implements HtmlMapper {
        public String mapSafeElement(String name) {
            return HtmlParser.this.mapSafeElement(name);
        }

        public boolean isDiscardElement(String name) {
            return HtmlParser.this.isDiscardElement(name);
        }

        public String mapSafeAttribute(String elementName, String attributeName) {
            return HtmlParser.this.mapSafeAttribute(elementName, attributeName);
        }
    }

}
+ */ +package org.apache.tika.parser.html; + +import java.util.Locale; + +/** + * Alternative HTML mapping rules that pass the input HTML as-is without any + * modifications. + * + * @since Apache Tika 0.8 + */ +public class IdentityHtmlMapper implements HtmlMapper { + + public static final HtmlMapper INSTANCE = new IdentityHtmlMapper(); + + public boolean isDiscardElement(String name) { + return false; + } + + public String mapSafeAttribute(String elementName, String attributeName) { + return attributeName.toLowerCase(Locale.ENGLISH); + } + + public String mapSafeElement(String name) { + return name.toLowerCase(Locale.ENGLISH); + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.html; + +import javax.xml.XMLConstants; +import java.util.Locale; + +import org.apache.tika.sax.ContentHandlerDecorator; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +/** + * Content handler decorator that downgrades XHTML elements to + * old-style HTML elements before passing them on to the decorated + * content handler. This downgrading consists of dropping all namespaces + * (and namespaced attributes) and uppercasing all element names. + * Used by the {@link HtmlParser} to make all incoming HTML look the same. 
+ */ +class XHTMLDowngradeHandler extends ContentHandlerDecorator { + + public XHTMLDowngradeHandler(ContentHandler handler) { + super(handler); + } + + @Override + public void startElement( + String uri, String localName, String name, Attributes atts) + throws SAXException { + String upper = localName.toUpperCase(Locale.ENGLISH); + + AttributesImpl attributes = new AttributesImpl(); + for (int i = 0; i < atts.getLength(); i++) { + String auri = atts.getURI(i); + String local = atts.getLocalName(i); + String qname = atts.getQName(i); + if (XMLConstants.NULL_NS_URI.equals(auri) + && !local.equals(XMLConstants.XMLNS_ATTRIBUTE) + && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) { + attributes.addAttribute( + auri, local, qname, atts.getType(i), atts.getValue(i)); + } + } + + super.startElement(XMLConstants.NULL_NS_URI, upper, upper, attributes); + } + + @Override + public void endElement(String uri, String localName, String name) + throws SAXException { + String upper = localName.toUpperCase(Locale.ENGLISH); + super.endElement(XMLConstants.NULL_NS_URI, upper, upper); + } + + @Override + public void startPrefixMapping(String prefix, String uri) { + } + + @Override + public void endPrefixMapping(String prefix) { + } + +}
