Author: jukka
Date: Sat Dec 28 01:14:27 2013
New Revision: 1553774

URL: http://svn.apache.org/r1553774
Log:
TIKA-1193: Allow access to HtmlParser's HtmlSchema

Patch by Markus Jelsma

Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=1553774&r1=1553773&r2=1553774&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Sat Dec 28 01:14:27 2013
@@ -92,9 +92,12 @@ public class HtmlParser extends Abstract
             org.ccil.cowan.tagsoup.Parser parser =
                     new org.ccil.cowan.tagsoup.Parser();
 
+            // Use schema from context or default
+            Schema schema = context.get(Schema.class, HTML_SCHEMA);
+
             // TIKA-528: Reuse share schema to avoid heavy instantiation
             parser.setProperty(
-                    org.ccil.cowan.tagsoup.Parser.schemaProperty, HTML_SCHEMA);
+                    org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
             // TIKA-599: Shared schema is thread-safe only if bogons are ignored
             parser.setFeature(
                     org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=1553774&r1=1553773&r2=1553774&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Sat Dec 28 01:14:27 2013
@@ -42,7 +42,10 @@ import org.apache.tika.metadata.Metadata
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.LinkContentHandler;
 import org.apache.tika.sax.TeeContentHandler;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
 import org.junit.Ignore;
 import org.junit.Test;
 import org.xml.sax.Attributes;
@@ -890,4 +893,36 @@ public class HtmlParserTest {
           assertTrue("testing: " +fileName, content.contains(hit));
        }
     }
+
+    // TIKA-1193
+    @Test
+    public void testCustomHtmlSchema() throws Exception {
+        // Default schema does not allow tables inside anchors
+        String test = "<html><body><a><table><tr><td>text</tr></tr></table></a></body></html>";
+
+        Metadata metadata = new Metadata();
+        LinkContentHandler linkContentHandler = new LinkContentHandler();
+
+        new HtmlParser().parse (
+                new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
+                linkContentHandler, metadata, new ParseContext());
+
+        // Expect no anchor text
+        assertEquals("", linkContentHandler.getLinks().get(0).getText());
+
+        // We'll change the schema to allow tables inside anchors!
+        Schema schema = new HTMLSchema();
+        schema.elementType("a", HTMLSchema.M_ANY, 65535, 0);
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(Schema.class, schema);
+        linkContentHandler = new LinkContentHandler();
+        new HtmlParser().parse (
+                new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
+                linkContentHandler, metadata, parseContext);
+
+        // Expect anchor text
+        assertEquals("\ttext\n\n", linkContentHandler.getLinks().get(0).getText());
+    }
+
 }


Reply via email to