Author: jukka
Date: Fri Nov 1 02:45:43 2013
New Revision: 1537803
URL: http://svn.apache.org/r1537803
Log:
TIKA-817: (PPT/PPTX) Missing date/time in text content
Fix test failure on Windows, where Java's built-in XML serializer would produce
different line endings than on Mac/Unix environments.
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java?rev=1537803&r1=1537802&r2=1537803&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java Fri Nov 1 02:45:43 2013
@@ -126,6 +126,13 @@ public class ToXMLContentHandler extends
@Override
public void startPrefixMapping(String prefix, String uri)
throws SAXException {
+ try {
+ if (currentElement != null
+ && prefix.equals(currentElement.getPrefix(uri))) {
+ return;
+ }
+ } catch (SAXException ignore) {
+ }
namespaces.put(uri, prefix);
}
@@ -182,7 +189,7 @@ public class ToXMLContentHandler extends
}
namespaces.clear();
-
+
// Reset the position in the tree, to avoid endless stack overflow
// chains (see TIKA-1070)
currentElement = currentElement.parent;
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java?rev=1537803&r1=1537802&r2=1537803&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java Fri Nov 1 02:45:43 2013
@@ -18,13 +18,8 @@ package org.apache.tika;
import java.io.File;
import java.io.InputStream;
-import java.io.StringWriter;
import java.net.URISyntaxException;
import java.net.URL;
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.sax.SAXTransformerFactory;
-import javax.xml.transform.sax.TransformerHandler;
-import javax.xml.transform.stream.StreamResult;
import junit.framework.TestCase;
@@ -32,6 +27,8 @@ import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.xml.sax.ContentHandler;
/**
* Parent class of Tika tests
@@ -89,22 +86,15 @@ public abstract class TikaTest extends T
InputStream input = null;
Metadata metadata = new Metadata();
Parser parser = new AutoDetectParser();
-
- StringWriter sw = new StringWriter();
- SAXTransformerFactory factory = (SAXTransformerFactory)
- SAXTransformerFactory.newInstance();
- TransformerHandler handler = factory.newTransformerHandler();
- handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
- handler.setResult(new StreamResult(sw));
ParseContext context = new ParseContext();
context.set(Parser.class, parser);
input = getResourceAsStream("/test-documents/" + filePath);
try {
+ ContentHandler handler = new ToXMLContentHandler();
parser.parse(input, handler, metadata, context);
- return new XMLResult(sw.toString(), metadata);
+ return new XMLResult(handler.toString(), metadata);
} finally {
input.close();
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1537803&r1=1537802&r2=1537803&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Fri Nov 1 02:45:43 2013
@@ -229,8 +229,8 @@ public class PowerPointParserTest extend
// TIKA-1025
public void testEmbeddedPlacedholder() throws Exception {
XMLResult result = getXML("testPPT_embedded2.ppt");
- assertContains("<div class=\"embedded\" id=\"1\"/>", result.xml);
- assertContains("<div class=\"embedded\" id=\"14\"/>", result.xml);
+ assertContains("<div class=\"embedded\" id=\"1\" />", result.xml);
+ assertContains("<div class=\"embedded\" id=\"14\" />", result.xml);
}
// TIKA-817
@@ -239,7 +239,8 @@ public class PowerPointParserTest extend
//autodate automatically. For pptx, where value is stored,
//value is extracted. For ppt, however, no date is extracted.
XMLResult result = getXML("testPPT_autodate.ppt");
- assertContains("<p class=\"slide-content\">Now<br/>\n"+
- "*<br/>\n*<br/>", result.xml);
+ assertContains(
+ "<p class=\"slide-content\">Now<br />\n*<br />\n*<br />",
+ result.xml);
}
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1537803&r1=1537802&r2=1537803&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Fri Nov 1 02:45:43 2013
@@ -141,7 +141,7 @@ public class WordParserTest extends Tika
// is the pdf file" and before "Bye Bye":
int i = result.indexOf("Here is the pdf file:");
assertTrue(i != -1);
- int j = result.indexOf("<div class=\"embedded\" id=\"_1402837031\"/>");
+ int j = result.indexOf("<div class=\"embedded\" id=\"_1402837031\" />");
assertTrue(j != -1);
int k = result.indexOf("Bye Bye");
assertTrue(k != -1);
@@ -153,14 +153,14 @@ public class WordParserTest extends Tika
// TIKA-982
public void testEmbeddedRTF() throws Exception {
String result = getXML("testWORD_embedded_rtf.doc").xml;
- assertTrue(result.indexOf("<div class=\"embedded\" id=\"_1404039792\"/>") != -1);
+ assertTrue(result.indexOf("<div class=\"embedded\" id=\"_1404039792\" />") != -1);
assertTrue(result.indexOf("_1404039792.rtf") != -1);
}
// TIKA-1019
public void testDocumentLink() throws Exception {
String result = getXML("testDocumentLink.doc").xml;
- assertTrue(result.indexOf("<div class=\"embedded\" id=\"_1327495610\"/>") != -1);
+ assertTrue(result.indexOf("<div class=\"embedded\" id=\"_1327495610\" />") != -1);
assertTrue(result.indexOf("_1327495610.unknown") != -1);
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1537803&r1=1537802&r2=1537803&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Fri Nov 1 02:45:43 2013
@@ -390,7 +390,7 @@ public class OOXMLParserTest extends Tik
assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
assertTrue(xml.contains("<h2>Heading Level 2</h2>"));
// Headings with anchor tags in them
- assertTrue(xml.replaceAll("\r?\n", "").contains("<h3><a name=\"OnLevel3\"/>Heading Level 3</h3>"));
+ assertTrue(xml.contains("<h3><a name=\"OnLevel3\" />Heading Level 3</h3>"));
// Bold and italic
assertTrue(xml.contains("<b>BOLD</b>"));
assertTrue(xml.contains("<i>ITALIC</i>"));
@@ -408,9 +408,9 @@ public class OOXMLParserTest extends Tik
xml = result.xml;
// Images 2-4 (there is no 1!)
- assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image2.png\" alt=\"A description...\"/>"));
- assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image3.jpeg\" alt=\"A description...\"/>"));
- assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image4.png\" alt=\"A description...\"/>"));
+ assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image2.png\" alt=\"A description...\" />"));
+ assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image3.jpeg\" alt=\"A description...\" />"));
+ assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image4.png\" alt=\"A description...\" />"));
// Text too
assertTrue(xml.contains("<p>The end!</p>"));
@@ -897,26 +897,10 @@ public class OOXMLParserTest extends Tik
// TIKA-997:
public void testEmbeddedZipInPPTX() throws Exception {
- InputStream input = OOXMLParserTest.class.getResourceAsStream(
- "/test-documents/test_embedded_zip.pptx");
- Metadata metadata = new Metadata();
- StringWriter sw = new StringWriter();
- SAXTransformerFactory factory = (SAXTransformerFactory)
- SAXTransformerFactory.newInstance();
- TransformerHandler handler = factory.newTransformerHandler();
- handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
- handler.setResult(new StreamResult(sw));
-
- try {
- new OOXMLParser().parse(input, handler, metadata, new ParseContext());
- } finally {
- input.close();
- }
- String xml = sw.toString();
- int h = xml.indexOf("<div class=\"embedded\" id=\"slide1_rId3\"/>");
+ String xml = getXML("test_embedded_zip.pptx").xml;
+ int h = xml.indexOf("<div class=\"embedded\" id=\"slide1_rId3\" />");
int i = xml.indexOf("Send me a note");
- int j = xml.indexOf("<div class=\"embedded\" id=\"slide2_rId4\"/>");
+ int j = xml.indexOf("<div class=\"embedded\" id=\"slide2_rId4\" />");
int k = xml.indexOf("<p>No title</p>");
assertTrue(h != -1);
assertTrue(i != -1);
@@ -965,8 +949,8 @@ public class OOXMLParserTest extends Tik
// TIKA-1032:
public void testEmbeddedPPTXTwoSlides() throws Exception {
String xml = getXML("testPPT_embedded_two_slides.pptx").xml;
- assertContains("<div class=\"embedded\" id=\"slide1_rId7\"/>" , xml);
- assertContains("<div class=\"embedded\" id=\"slide2_rId7\"/>" , xml);
+ assertContains("<div class=\"embedded\" id=\"slide1_rId7\" />" , xml);
+ assertContains("<div class=\"embedded\" id=\"slide2_rId7\" />" , xml);
}
/**
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1537803&r1=1537802&r2=1537803&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Fri Nov 1 02:45:43 2013
@@ -355,7 +355,7 @@ public class PDFParserTest extends TikaT
*/
public void testLinks() throws Exception {
final XMLResult result = getXML("testPDFVarious.pdf");
- assertContains("<div class=\"annotation\"><a href=\"http://tika.apache.org/\"/></div>", result.xml);
+ assertContains("<div class=\"annotation\"><a href=\"http://tika.apache.org/\" /></div>", result.xml);
}
public void testDisableAutoSpace() throws Exception {
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java?rev=1537803&r1=1537802&r2=1537803&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java Fri Nov 1 02:45:43 2013
@@ -139,8 +139,8 @@ public class ZipParserTest extends Abstr
// TIKA-1036
public void testPlaceholders() throws Exception {
String xml = getXML("testEmbedded.zip").xml;
- assertContains("<div class=\"embedded\" id=\"test1.txt\"/>", xml);
- assertContains("<div class=\"embedded\" id=\"test2.txt\"/>", xml);
+ assertContains("<div class=\"embedded\" id=\"test1.txt\" />", xml);
+ assertContains("<div class=\"embedded\" id=\"test2.txt\" />", xml);
// Also make sure EMBEDDED_RELATIONSHIP_ID was
// passed when parsing the embedded docs: