Author: nick
Date: Mon Dec 22 01:25:24 2014
New Revision: 1647234
URL: http://svn.apache.org/r1647234
Log:
Upgrade to POI 3.11 final, patch from TIKA-1469
Modified:
tika/trunk/tika-parsers/pom.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Modified: tika/trunk/tika-parsers/pom.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1647234&r1=1647233&r2=1647234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Mon Dec 22 01:25:24 2014
@@ -36,7 +36,7 @@
<url>http://tika.apache.org/</url>
<properties>
- <poi.version>3.11-beta2</poi.version>
+ <poi.version>3.11</poi.version>
<codec.version>1.9</codec.version>
<!-- NOTE: sync with POI -->
<compress.version>1.8.1</compress.version>
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java?rev=1647234&r1=1647233&r2=1647234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java Mon Dec 22 01:25:24 2014
@@ -159,8 +159,8 @@ public class MetadataExtractor {
Metadata metadata) {
org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties
props = properties.getUnderlyingProperties();
-
- for(CTProperty property : props.getPropertyList()) {
+ for (int i = 0; i < props.sizeOfPropertyArray(); i++) {
+ CTProperty property = props.getPropertyArray(i);
String val = null;
Date date = null;
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1647234&r1=1647233&r2=1647234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java Mon Dec 22 01:25:24 2014
@@ -16,10 +16,10 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
+import javax.xml.namespace.QName;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
-import javax.xml.namespace.QName;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.PackagePart;
@@ -50,6 +50,7 @@ import org.apache.xmlbeans.XmlException;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
import org.openxmlformats.schemas.presentationml.x2006.main.CTPicture;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
@@ -99,7 +100,8 @@ public class XSLFPowerPointExtractorDeco
// comments (if present)
XSLFComments comments = slide.getComments();
if (comments != null) {
- for (CTComment comment : comments.getCTCommentsList().getCmList()) {
+ for (int i = 0; i < comments.getNumberOfComments(); i++) {
+ CTComment comment = comments.getCommentAt(i);
xhtml.element("p", comment.getText());
}
}
@@ -181,32 +183,35 @@ public class XSLFPowerPointExtractorDeco
} catch(Exception e) {
throw new TikaException(e.getMessage()); // Shouldn't happen
}
-
- for (CTSlideIdListEntry ctSlide : document.getSlideReferences().getSldIdList()) {
- // Add the slide
- PackagePart slidePart;
- try {
- slidePart = document.getSlidePart(ctSlide);
- } catch(IOException e) {
- throw new TikaException("Broken OOXML file", e);
- } catch(XmlException xe) {
- throw new TikaException("Broken OOXML file", xe);
- }
- parts.add(slidePart);
-
- // If it has drawings, return those too
- try {
- for(PackageRelationship rel : slidePart.getRelationshipsByType(XSLFRelation.VML_DRAWING.getRelation())) {
- if(rel.getTargetMode() == TargetMode.INTERNAL) {
- PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
- parts.add( rel.getPackage().getPart(relName) );
- }
- }
- } catch(InvalidFormatException e) {
- throw new TikaException("Broken OOXML file", e);
- }
- }
+ CTSlideIdList ctSlideIdList = document.getSlideReferences();
+ if (ctSlideIdList != null) {
+ for (int i = 0; i < ctSlideIdList.sizeOfSldIdArray(); i++) {
+ CTSlideIdListEntry ctSlide = ctSlideIdList.getSldIdArray(i);
+ // Add the slide
+ PackagePart slidePart;
+ try {
+ slidePart = document.getSlidePart(ctSlide);
+ } catch (IOException e) {
+ throw new TikaException("Broken OOXML file", e);
+ } catch (XmlException xe) {
+ throw new TikaException("Broken OOXML file", xe);
+ }
+ parts.add(slidePart);
+
+ // If it has drawings, return those too
+ try {
+ for (PackageRelationship rel : slidePart.getRelationshipsByType(XSLFRelation.VML_DRAWING.getRelation())) {
+ if (rel.getTargetMode() == TargetMode.INTERNAL) {
+ PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
+ parts.add(rel.getPackage().getPart(relName));
+ }
+ }
+ } catch (InvalidFormatException e) {
+ throw new TikaException("Broken OOXML file", e);
+ }
+ }
+ }
return parts;
}
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1647234&r1=1647233&r2=1647234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Mon Dec 22 01:25:24 2014
@@ -16,12 +16,11 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
+import javax.xml.namespace.QName;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
-import javax.xml.namespace.QName;
-
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
@@ -32,7 +31,6 @@ import org.apache.poi.xwpf.usermodel.IBo
import org.apache.poi.xwpf.usermodel.ICell;
import org.apache.poi.xwpf.usermodel.IRunElement;
import org.apache.poi.xwpf.usermodel.ISDTContent;
-import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFHeaderFooter;
import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
@@ -42,6 +40,7 @@ import org.apache.poi.xwpf.usermodel.XWP
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFSDT;
+import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
import org.apache.poi.xwpf.usermodel.XWPFStyle;
import org.apache.poi.xwpf.usermodel.XWPFStyles;
import org.apache.poi.xwpf.usermodel.XWPFTable;
@@ -197,7 +196,8 @@ public class XWPFWordExtractorDecorator
// Attach bookmarks for the paragraph
// (In future, we might put them in the right place, for now
// we just put them in the correct paragraph)
- for (CTBookmark bookmark : paragraph.getCTP().getBookmarkStartList()) {
+ for (int i = 0; i < paragraph.getCTP().sizeOfBookmarkStartArray(); i++) {
+ CTBookmark bookmark = paragraph.getCTP().getBookmarkStartArray(i);
xhtml.startElement("a", "name", bookmark.getName());
xhtml.endElement("a");
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1647234&r1=1647233&r2=1647234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Mon Dec 22 01:25:24 2014
@@ -16,21 +16,20 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.StringWriter;
+import java.util.HashMap;
import java.util.Locale;
-
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.sax.SAXTransformerFactory;
-import javax.xml.transform.sax.TransformerHandler;
-import javax.xml.transform.stream.StreamResult;
+import java.util.Map;
import org.apache.tika.TikaTest;
+import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
@@ -41,11 +40,15 @@ import org.apache.tika.metadata.TikaMeta
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.microsoft.WordParserTest;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
public class OOXMLParserTest extends TikaTest {
private Parser parser = new AutoDetectParser();
@@ -1089,5 +1092,56 @@ public class OOXMLParserTest extends Tik
assertTrue(a < b);
}
+ @Test
+ public void testEncrypted() throws Exception {
+ Map<String, String> tests = new HashMap<String, String>();
+ tests.put("testWORD_protected_passtika.docx",
+ "This is an encrypted Word 2007 File");
+ tests.put("testPPT_protected_passtika.pptx",
+ "This is an encrypted PowerPoint 2007 slide.");
+ tests.put("testEXCEL_protected_passtika.xlsx",
+ "This is an Encrypted Excel spreadsheet.");
+
+ Parser parser = new AutoDetectParser();
+ Metadata m = new Metadata();
+ PasswordProvider passwordProvider = new PasswordProvider() {
+ @Override
+ public String getPassword(Metadata metadata) {
+ return "tika";
+ }
+ };
+ ParseContext passwordContext = new ParseContext();
+ passwordContext.set(org.apache.tika.parser.PasswordProvider.class, passwordProvider);
+
+ for (Map.Entry<String, String> e : tests.entrySet()) {
+ InputStream is = null;
+ try {
+ is = getTestDocument(e.getKey());
+ ContentHandler handler = new BodyContentHandler();
+ parser.parse(is, handler, m, passwordContext);
+ assertContains(e.getValue(), handler.toString());
+ } finally {
+ is.close();
+ }
+ }
+
+ ParseContext context = new ParseContext();
+ //now try with no password
+ for (Map.Entry<String, String> e : tests.entrySet()) {
+ InputStream is = null;
+ boolean exc = false;
+ try {
+ is = getTestDocument(e.getKey());
+ ContentHandler handler = new BodyContentHandler();
+ parser.parse(is, handler, m, context);
+ } catch (EncryptedDocumentException ex) {
+ exc = true;
+ } finally {
+ is.close();
+ }
+ assertTrue(exc);
+ }
+
+ }
}