Author: jukka
Date: Fri Nov  1 02:45:43 2013
New Revision: 1537803

URL: http://svn.apache.org/r1537803
Log:
TIKA-817: (PPT/PPTX) Missing date/time in text content

Fix test failure on Windows where Java's built-in XML serializer would produce 
different line endings than on Mac/Unix environments

Modified:
    
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java?rev=1537803&r1=1537802&r2=1537803&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java 
(original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java 
Fri Nov  1 02:45:43 2013
@@ -126,6 +126,13 @@ public class ToXMLContentHandler extends
     @Override
     public void startPrefixMapping(String prefix, String uri)
             throws SAXException {
+        try {
+            if (currentElement != null
+                    && prefix.equals(currentElement.getPrefix(uri))) {
+                return;
+            }
+        } catch (SAXException ignore) {
+        }
         namespaces.put(uri, prefix);
     }
 
@@ -182,7 +189,7 @@ public class ToXMLContentHandler extends
         }
 
         namespaces.clear();
-        
+
         // Reset the position in the tree, to avoid endless stack overflow
         // chains (see TIKA-1070)
         currentElement = currentElement.parent;

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java?rev=1537803&r1=1537802&r2=1537803&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java 
(original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java Fri Nov 
 1 02:45:43 2013
@@ -18,13 +18,8 @@ package org.apache.tika;
 
 import java.io.File;
 import java.io.InputStream;
-import java.io.StringWriter;
 import java.net.URISyntaxException;
 import java.net.URL;
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.sax.SAXTransformerFactory;
-import javax.xml.transform.sax.TransformerHandler;
-import javax.xml.transform.stream.StreamResult;
 
 import junit.framework.TestCase;
 
@@ -32,6 +27,8 @@ import org.apache.tika.metadata.Metadata
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.xml.sax.ContentHandler;
 
 /**
  * Parent class of Tika tests
@@ -89,22 +86,15 @@ public abstract class TikaTest extends T
         InputStream input = null;
         Metadata metadata = new Metadata();
         Parser parser = new AutoDetectParser();
-        
-        StringWriter sw = new StringWriter();
-        SAXTransformerFactory factory = (SAXTransformerFactory)
-                 SAXTransformerFactory.newInstance();
-        TransformerHandler handler = factory.newTransformerHandler();
-        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
-        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
-        handler.setResult(new StreamResult(sw));
 
         ParseContext context = new ParseContext();
         context.set(Parser.class, parser);
 
         input = getResourceAsStream("/test-documents/" + filePath);
         try {
+            ContentHandler handler = new ToXMLContentHandler();
             parser.parse(input, handler, metadata, context);
-            return new XMLResult(sw.toString(), metadata);
+            return new XMLResult(handler.toString(), metadata);
         } finally {
             input.close();
         }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1537803&r1=1537802&r2=1537803&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
 Fri Nov  1 02:45:43 2013
@@ -229,8 +229,8 @@ public class PowerPointParserTest extend
     // TIKA-1025
     public void testEmbeddedPlacedholder() throws Exception {
        XMLResult result = getXML("testPPT_embedded2.ppt");
-       assertContains("<div class=\"embedded\" id=\"1\"/>", result.xml);
-       assertContains("<div class=\"embedded\" id=\"14\"/>", result.xml);
+       assertContains("<div class=\"embedded\" id=\"1\" />", result.xml);
+       assertContains("<div class=\"embedded\" id=\"14\" />", result.xml);
     }
 
     // TIKA-817
@@ -239,7 +239,8 @@ public class PowerPointParserTest extend
        //autodate automatically.  For pptx, where value is stored,
        //value is extracted.  For ppt, however, no date is extracted.
        XMLResult result = getXML("testPPT_autodate.ppt");
-       assertContains("<p class=\"slide-content\">Now<br/>\n"+
-          "*<br/>\n*<br/>", result.xml);
+       assertContains(
+               "<p class=\"slide-content\">Now<br />\n*<br />\n*<br />",
+               result.xml);
     }
 }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1537803&r1=1537802&r2=1537803&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
 Fri Nov  1 02:45:43 2013
@@ -141,7 +141,7 @@ public class WordParserTest extends Tika
         // is the pdf file" and before "Bye Bye":
         int i = result.indexOf("Here is the pdf file:");
         assertTrue(i != -1);
-        int j = result.indexOf("<div class=\"embedded\" id=\"_1402837031\"/>");
+        int j = result.indexOf("<div class=\"embedded\" id=\"_1402837031\" />");
         assertTrue(j != -1);
         int k = result.indexOf("Bye Bye");
         assertTrue(k != -1);
@@ -153,14 +153,14 @@ public class WordParserTest extends Tika
     // TIKA-982
     public void testEmbeddedRTF() throws Exception {
         String result = getXML("testWORD_embedded_rtf.doc").xml;
-        assertTrue(result.indexOf("<div class=\"embedded\" 
id=\"_1404039792\"/>") != -1);
+        assertTrue(result.indexOf("<div class=\"embedded\" id=\"_1404039792\" />") != -1);
         assertTrue(result.indexOf("_1404039792.rtf") != -1);
     }
 
     // TIKA-1019
     public void testDocumentLink() throws Exception {
         String result = getXML("testDocumentLink.doc").xml;
-        assertTrue(result.indexOf("<div class=\"embedded\" 
id=\"_1327495610\"/>") != -1);
+        assertTrue(result.indexOf("<div class=\"embedded\" id=\"_1327495610\" />") != -1);
         assertTrue(result.indexOf("_1327495610.unknown") != -1);
     }
 

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1537803&r1=1537802&r2=1537803&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 Fri Nov  1 02:45:43 2013
@@ -390,7 +390,7 @@ public class OOXMLParserTest extends Tik
       assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
       assertTrue(xml.contains("<h2>Heading Level 2</h2>"));
       // Headings with anchor tags in them
-      assertTrue(xml.replaceAll("\r?\n", "").contains("<h3><a 
name=\"OnLevel3\"/>Heading Level 3</h3>"));
+      assertTrue(xml.contains("<h3><a name=\"OnLevel3\" />Heading Level 3</h3>"));
       // Bold and italic
       assertTrue(xml.contains("<b>BOLD</b>"));
       assertTrue(xml.contains("<i>ITALIC</i>"));
@@ -408,9 +408,9 @@ public class OOXMLParserTest extends Tik
       xml = result.xml;
 
       // Images 2-4 (there is no 1!)
-      assertTrue("Image not found in:\n"+xml, xml.contains("<img 
src=\"embedded:image2.png\" alt=\"A description...\"/>"));
-      assertTrue("Image not found in:\n"+xml, xml.contains("<img 
src=\"embedded:image3.jpeg\" alt=\"A description...\"/>"));
-      assertTrue("Image not found in:\n"+xml, xml.contains("<img 
src=\"embedded:image4.png\" alt=\"A description...\"/>"));
+      assertTrue("Image not found in:\n"+xml, xml.contains("<img 
src=\"embedded:image2.png\" alt=\"A description...\" />"));
+      assertTrue("Image not found in:\n"+xml, xml.contains("<img 
src=\"embedded:image3.jpeg\" alt=\"A description...\" />"));
+      assertTrue("Image not found in:\n"+xml, xml.contains("<img 
src=\"embedded:image4.png\" alt=\"A description...\" />"));
             
       // Text too
       assertTrue(xml.contains("<p>The end!</p>"));
@@ -897,26 +897,10 @@ public class OOXMLParserTest extends Tik
 
     // TIKA-997:
     public void testEmbeddedZipInPPTX() throws Exception {
-        InputStream input = OOXMLParserTest.class.getResourceAsStream(
-              "/test-documents/test_embedded_zip.pptx");
-        Metadata metadata = new Metadata();
-        StringWriter sw = new StringWriter();
-        SAXTransformerFactory factory = (SAXTransformerFactory)
-                 SAXTransformerFactory.newInstance();
-        TransformerHandler handler = factory.newTransformerHandler();
-        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
-        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
-        handler.setResult(new StreamResult(sw));
-
-        try {
-            new OOXMLParser().parse(input, handler, metadata, new 
ParseContext());
-        } finally {
-            input.close();
-        }
-        String xml = sw.toString();
-        int h = xml.indexOf("<div class=\"embedded\" id=\"slide1_rId3\"/>");
+        String xml = getXML("test_embedded_zip.pptx").xml;
+        int h = xml.indexOf("<div class=\"embedded\" id=\"slide1_rId3\" />");
         int i = xml.indexOf("Send me a note");
-        int j = xml.indexOf("<div class=\"embedded\" id=\"slide2_rId4\"/>");
+        int j = xml.indexOf("<div class=\"embedded\" id=\"slide2_rId4\" />");
         int k = xml.indexOf("<p>No title</p>");
         assertTrue(h != -1);
         assertTrue(i != -1);
@@ -965,8 +949,8 @@ public class OOXMLParserTest extends Tik
     // TIKA-1032:
     public void testEmbeddedPPTXTwoSlides() throws Exception {
         String xml = getXML("testPPT_embedded_two_slides.pptx").xml;
-        assertContains("<div class=\"embedded\" id=\"slide1_rId7\"/>" , xml);
-        assertContains("<div class=\"embedded\" id=\"slide2_rId7\"/>" , xml);
+        assertContains("<div class=\"embedded\" id=\"slide1_rId7\" />" , xml);
+        assertContains("<div class=\"embedded\" id=\"slide2_rId7\" />" , xml);
     }
     
     /**

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1537803&r1=1537802&r2=1537803&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 Fri Nov  1 02:45:43 2013
@@ -355,7 +355,7 @@ public class PDFParserTest extends TikaT
      */
     public void testLinks() throws Exception {
         final XMLResult result = getXML("testPDFVarious.pdf");
-        assertContains("<div class=\"annotation\"><a 
href=\"http://tika.apache.org/\"/></div>", result.xml);
+        assertContains("<div class=\"annotation\"><a href=\"http://tika.apache.org/\" /></div>", result.xml);
     }
 
     public void testDisableAutoSpace() throws Exception {

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java?rev=1537803&r1=1537802&r2=1537803&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
 Fri Nov  1 02:45:43 2013
@@ -139,8 +139,8 @@ public class ZipParserTest extends Abstr
     // TIKA-1036
     public void testPlaceholders() throws Exception {
         String xml = getXML("testEmbedded.zip").xml;
-        assertContains("<div class=\"embedded\" id=\"test1.txt\"/>", xml);
-        assertContains("<div class=\"embedded\" id=\"test2.txt\"/>", xml);
+        assertContains("<div class=\"embedded\" id=\"test1.txt\" />", xml);
+        assertContains("<div class=\"embedded\" id=\"test2.txt\" />", xml);
 
         // Also make sure EMBEDDED_RELATIONSHIP_ID was
         // passed when parsing the embedded docs:


Reply via email to