Author: jukka
Date: Sun Dec 13 21:03:02 2009
New Revision: 890117

URL: http://svn.apache.org/viewvc?rev=890117&view=rev
Log:
TIKA-347: Make HtmlParser customizable through ParseContext

Added:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=890117&r1=890116&r2=890117&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java Sun Dec 13 21:03:02 2009
@@ -28,7 +28,7 @@
 
 class HtmlHandler extends TextContentHandler {
 
-    private final HtmlParser parser;
+    private final HtmlMapper mapper;
 
     private final XHTMLContentHandler xhtml;
 
@@ -43,9 +43,9 @@
     private final StringBuilder title = new StringBuilder();
 
     private HtmlHandler(
-            HtmlParser parser, XHTMLContentHandler xhtml, Metadata metadata) {
+            HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata) {
         super(xhtml);
-        this.parser = parser;
+        this.mapper = mapper;
         this.xhtml = xhtml;
         this.metadata = metadata;
 
@@ -65,8 +65,8 @@
     }
 
     public HtmlHandler(
-            HtmlParser parser, ContentHandler handler, Metadata metadata) {
-        this(parser, new XHTMLContentHandler(handler, metadata), metadata);
+            HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
+        this(mapper, new XHTMLContentHandler(handler, metadata), metadata);
     }
 
     @Override
@@ -79,7 +79,7 @@
         if ("BODY".equals(name) || bodyLevel > 0) {
             bodyLevel++;
         }
-        if (parser.isDiscardElement(name) || discardLevel > 0) {
+        if (mapper.isDiscardElement(name) || discardLevel > 0) {
             discardLevel++;
         }
 
@@ -103,7 +103,7 @@
         }
 
         if (bodyLevel > 0 && discardLevel == 0) {
-            String safe = parser.mapSafeElement(name);
+            String safe = mapper.mapSafeElement(name);
             if (safe != null) {
                 xhtml.startElement(safe);
             } else if ("A".equals(name)) {
@@ -128,7 +128,7 @@
     public void endElement(
             String uri, String local, String name) throws SAXException {
         if (bodyLevel > 0 && discardLevel == 0) {
-            String safe = parser.mapSafeElement(name);
+            String safe = mapper.mapSafeElement(name);
             if (safe != null) {
                 xhtml.endElement(safe);
             } else if ("A".equals(name)) {

Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java?rev=890117&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java Sun Dec 13 21:03:02 2009
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+/**
+ * HTML mapper used to make incoming HTML documents easier to handle by
+ * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
+ * the parse context and uses it to map parsed HTML to "safe" XHTML. A client
+ * that wants to customize this mapping can place a custom HtmlMapper instance
+ * into the parse context.
+ *
+ * @since Apache Tika 0.6
+ */
+public interface HtmlMapper {
+
+    /**
+     * Maps "safe" HTML element names to semantic XHTML equivalents. If the
+     * given element is unknown or deemed unsafe for inclusion in the parse
+     * output, then this method returns <code>null</code> and the element
+     * will be ignored but the content inside it is still processed. See
+     * the {@link #isDiscardElement(String)} method for a way to discard
+     * the entire contents of an element.
+     *
+     * @param name HTML element name (upper case)
+     * @return XHTML element name (lower case), or
+     *         <code>null</code> if the element is unsafe 
+     */
+    String mapSafeElement(String name);
+
+    /**
+     * Checks whether all content within the given HTML element should be
+     * discarded instead of including it in the parse output.
+     *
+     * @param name HTML element name (upper case)
+     * @return <code>true</code> if content inside the named element
+     *         should be ignored, <code>false</code> otherwise
+     */
+    boolean isDiscardElement(String name);
+
+}

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=890117&r1=890116&r2=890117&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Sun Dec 13 21:03:02 2009
@@ -148,11 +148,15 @@
         InputSource source = new InputSource(stream); 
         source.setEncoding(getEncoding(stream, metadata));
 
+        // Get the HTML mapper from the parse context
+        HtmlMapper mapper =
+            context.get(HtmlMapper.class, new HtmlParserMapper());
+
         // Parse the HTML document
         org.ccil.cowan.tagsoup.Parser parser =
             new org.ccil.cowan.tagsoup.Parser();
         parser.setContentHandler(new XHTMLDowngradeHandler(
-                new HtmlHandler(this, handler, metadata)));
+                new HtmlHandler(mapper, handler, metadata)));
         parser.parse(source);
     }
 
@@ -226,4 +230,21 @@
         return "STYLE".equals(name) || "SCRIPT".equals(name);
     }
 
+    /**
+     * Adapter class that maintains backwards compatibility with the
+     * protected HtmlParser methods. Making HtmlParser implement HtmlMapper
+     * directly would require those methods to be public, which would break
+     * backwards compatibility with subclasses.
+     * <p>
+     * TODO: Cleanup in Tika 1.0
+     */
+    private class HtmlParserMapper implements HtmlMapper {
+        public String mapSafeElement(String name) {
+            return HtmlParser.this.mapSafeElement(name);
+        }
+        public boolean isDiscardElement(String name) {
+            return HtmlParser.this.isDiscardElement(name);
+        }
+    }
+
 }


Reply via email to