Author: jukka
Date: Sun Dec 13 21:03:02 2009
New Revision: 890117
URL: http://svn.apache.org/viewvc?rev=890117&view=rev
Log:
TIKA-347: Make HtmlParser customizable through ParseContext
Added:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=890117&r1=890116&r2=890117&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java Sun Dec 13 21:03:02 2009
@@ -28,7 +28,7 @@
class HtmlHandler extends TextContentHandler {
- private final HtmlParser parser;
+ private final HtmlMapper mapper;
private final XHTMLContentHandler xhtml;
@@ -43,9 +43,9 @@
private final StringBuilder title = new StringBuilder();
private HtmlHandler(
- HtmlParser parser, XHTMLContentHandler xhtml, Metadata metadata) {
+ HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata) {
super(xhtml);
- this.parser = parser;
+ this.mapper = mapper;
this.xhtml = xhtml;
this.metadata = metadata;
@@ -65,8 +65,8 @@
}
public HtmlHandler(
- HtmlParser parser, ContentHandler handler, Metadata metadata) {
- this(parser, new XHTMLContentHandler(handler, metadata), metadata);
+ HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
+ this(mapper, new XHTMLContentHandler(handler, metadata), metadata);
}
@Override
@@ -79,7 +79,7 @@
if ("BODY".equals(name) || bodyLevel > 0) {
bodyLevel++;
}
- if (parser.isDiscardElement(name) || discardLevel > 0) {
+ if (mapper.isDiscardElement(name) || discardLevel > 0) {
discardLevel++;
}
@@ -103,7 +103,7 @@
}
if (bodyLevel > 0 && discardLevel == 0) {
- String safe = parser.mapSafeElement(name);
+ String safe = mapper.mapSafeElement(name);
if (safe != null) {
xhtml.startElement(safe);
} else if ("A".equals(name)) {
@@ -128,7 +128,7 @@
public void endElement(
String uri, String local, String name) throws SAXException {
if (bodyLevel > 0 && discardLevel == 0) {
- String safe = parser.mapSafeElement(name);
+ String safe = mapper.mapSafeElement(name);
if (safe != null) {
xhtml.endElement(safe);
} else if ("A".equals(name)) {
Added:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java?rev=890117&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java Sun Dec 13 21:03:02 2009
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+/**
+ * HTML mapper used to make incoming HTML documents easier to handle by
+ * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
+ * the parse context and uses it to map parsed HTML to "safe" XHTML. A client
+ * that wants to customize this mapping can place a custom HtmlMapper instance
+ * into the parse context.
+ *
+ * @since Apache Tika 0.6
+ */
+public interface HtmlMapper {
+
+ /**
+ * Maps "safe" HTML element names to semantic XHTML equivalents. If the
+ * given element is unknown or deemed unsafe for inclusion in the parse
+ * output, then this method returns <code>null</code> and the element
+ * will be ignored but the content inside it is still processed. See
+ * the {@link #isDiscardElement(String)} method for a way to discard
+ * the entire contents of an element.
+ *
+ * @param name HTML element name (upper case)
+ * @return XHTML element name (lower case), or
+ * <code>null</code> if the element is unsafe
+ */
+ String mapSafeElement(String name);
+
+ /**
+ * Checks whether all content within the given HTML element should be
+ * discarded instead of including it in the parse output.
+ *
+ * @param name HTML element name (upper case)
+ * @return <code>true</code> if content inside the named element
+ * should be ignored, <code>false</code> otherwise
+ */
+ boolean isDiscardElement(String name);
+
+}
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=890117&r1=890116&r2=890117&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Sun Dec 13 21:03:02 2009
@@ -148,11 +148,15 @@
InputSource source = new InputSource(stream);
source.setEncoding(getEncoding(stream, metadata));
+ // Get the HTML mapper from the parse context
+ HtmlMapper mapper =
+ context.get(HtmlMapper.class, new HtmlParserMapper());
+
// Parse the HTML document
org.ccil.cowan.tagsoup.Parser parser =
new org.ccil.cowan.tagsoup.Parser();
parser.setContentHandler(new XHTMLDowngradeHandler(
- new HtmlHandler(this, handler, metadata)));
+ new HtmlHandler(mapper, handler, metadata)));
parser.parse(source);
}
@@ -226,4 +230,21 @@
return "STYLE".equals(name) || "SCRIPT".equals(name);
}
+ /**
+ * Adapter class that maintains backwards compatibility with the
+ * protected HtmlParser methods. Making HtmlParser implement HtmlMapper
+ * directly would require those methods to be public, which would break
+ * backwards compatibility with subclasses.
+ * <p>
+ * TODO: Cleanup in Tika 1.0
+ */
+ private class HtmlParserMapper implements HtmlMapper {
+ public String mapSafeElement(String name) {
+ return HtmlParser.this.mapSafeElement(name);
+ }
+ public boolean isDiscardElement(String name) {
+ return HtmlParser.this.isDiscardElement(name);
+ }
+ }
+
}