Author: jukka
Date: Thu Sep  4 10:26:47 2008
New Revision: 692168

URL: http://svn.apache.org/viewvc?rev=692168&view=rev
Log:
TIKA-149: Parser for zip files 

Include the structured XHTML <body/> content of the zip entries in the output 
document.

Needed to modify BodyContentHandler and MatchingContentHandler to make this 
work. I believe the modifications made both classes better and should cause no 
backwards compatibility issues.

Modified:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java
    incubator/tika/trunk/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java?rev=692168&r1=692167&r2=692168&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java 
(original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java 
Thu Sep  4 10:26:47 2008
@@ -87,9 +87,10 @@
         try {
             Metadata metadata = new Metadata();
             metadata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
-            ContentHandler content = new BodyContentHandler();
-            getParser().parse(new CloseShieldInputStream(stream), content, metadata);
-            xhtml.element("p", content.toString());
+            getParser().parse(
+                    new CloseShieldInputStream(stream),
+                    new BodyContentHandler(xhtml),
+                    metadata);
         } catch (TikaException e) {
             // Could not parse the entry, just skip the content
         }

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java?rev=692168&r1=692167&r2=692168&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java 
(original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java 
Thu Sep  4 10:26:47 2008
@@ -25,8 +25,9 @@
 import org.xml.sax.ContentHandler;
 
 /**
- * Content handler decorator that only passes the XHTML &lt;body/&gt;
- * tag and everything inside it to the underlying handler.
+ * Content handler decorator that only passes everything inside
+ * the XHTML &lt;body/&gt; tag to the underlying handler. Note that
+ * the &lt;body/&gt; tag itself is <em>not</em> passed on.
  */
 public class BodyContentHandler extends ContentHandlerDecorator {
 
@@ -40,7 +41,7 @@
      * The XPath matcher used to select the XHTML body contents.
      */
     private static final Matcher MATCHER =
-        PARSER.parse("/xhtml:html/xhtml:body//node()");
+        PARSER.parse("/xhtml:html/xhtml:body/*//node()");
 
     /**
      * Creates a content handler that passes all XHTML body events to the

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java?rev=692168&r1=692167&r2=692168&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java
 Thu Sep  4 10:26:47 2008
@@ -26,7 +26,9 @@
 
 /**
  * Content handler decorator that only passes the elements, attributes,
- * and text nodes that match the given XPath expression.
+ * and text nodes that match the given XPath expression. Note especially
+ * that {@link #startDocument()} and {@link #endDocument()} events are not
+ * passed to the decorated handler.
  */
 public class MatchingContentHandler extends ContentHandlerDecorator {
 
@@ -100,4 +102,16 @@
         }
     }
 
+    /**
+     * Ignored.
+     */
+    public void startDocument() {
+    }
+
+    /**
+     * Ignored.
+     */
+    public void endDocument() {
+    }
+
 }


Reply via email to