Author: nick
Date: Thu Jun 27 12:40:59 2013
New Revision: 1497332
URL: http://svn.apache.org/r1497332
Log:
Patch from Daniel Bonniot from TIKA-1109 - Fetch OOXML metadata earlier, to
tidy code and make it available if required during parsing
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=1497332&r1=1497331&r2=1497332&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java Thu Jun 27 12:40:59 2013
@@ -40,7 +40,6 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pkg.ZipContainerDetector;
-import org.apache.tika.sax.EndDocumentShieldingContentHandler;
import org.apache.xmlbeans.XmlException;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -105,17 +104,12 @@ public class OOXMLExtractorFactory {
extractor = new POIXMLTextExtractorDecorator(context,
poiExtractor);
}
- // We need to get the content first, but not end
- // the document just yet
- EndDocumentShieldingContentHandler handler =
- new EndDocumentShieldingContentHandler(baseHandler);
- extractor.getXHTML(handler, metadata, context);
-
- // Now we can get the metadata
+ // Get the bulk of the metadata first, so that it's accessible during
+ // parsing if desired by the client (see TIKA-1109)
extractor.getMetadataExtractor().extract(metadata);
- // Then finish up
- handler.reallyEndDocument();
+ // Extract the text, along with any in-document metadata
+ extractor.getXHTML(baseHandler, metadata, context);
} catch (IllegalArgumentException e) {
if (e.getMessage().startsWith("No supported documents found")) {
throw new TikaException(
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=1497332&r1=1497331&r2=1497332&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java Thu Jun 27 12:40:59 2013
@@ -64,8 +64,8 @@ public class XSSFExcelExtractorDecorator
private final XSSFEventBasedExcelExtractor extractor;
private final DataFormatter formatter;
private final List<PackagePart> sheetParts = new ArrayList<PackagePart>();
- private final List<Boolean> sheetProtected = new ArrayList<Boolean>();
-
+ private Metadata metadata;
+
public XSSFExcelExtractorDecorator(
ParseContext context, XSSFEventBasedExcelExtractor extractor,
Locale locale) {
super(context, extractor);
@@ -81,6 +81,17 @@ public class XSSFExcelExtractorDecorator
}
}
+ @Override
+ public void getXHTML(
+ ContentHandler handler, Metadata metadata, ParseContext context)
+ throws SAXException, XmlException, IOException, TikaException {
+
+ this.metadata = metadata;
+ metadata.set(TikaMetadataKeys.PROTECTED, "false");
+
+ super.getXHTML(handler, metadata, context);
+ }
+
/**
* @see org.apache.poi.xssf.extractor.XSSFExcelExtractor#getText()
*/
@@ -164,7 +175,9 @@ public class XSSFExcelExtractorDecorator
sheetParser.parse(sheetSource);
sheetInputStream.close();
- sheetProtected.add(handler.hasProtection);
+ if (handler.hasProtection) {
+ metadata.set(TikaMetadataKeys.PROTECTED, "true");
+ }
} catch(ParserConfigurationException e) {
throw new RuntimeException("SAX parser appears to be broken - " +
e.getMessage());
}
@@ -344,21 +357,4 @@ public class XSSFExcelExtractorDecorator
return parts;
}
-
- @Override
- public MetadataExtractor getMetadataExtractor() {
- return new MetadataExtractor(extractor) {
- @Override
- public void extract(Metadata metadata) throws TikaException {
- super.extract(metadata);
-
- metadata.set(TikaMetadataKeys.PROTECTED, "false");
- for(boolean prot : sheetProtected) {
- if(prot) {
- metadata.set(TikaMetadataKeys.PROTECTED, "true");
- }
- }
- }
- };
- }
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1497332&r1=1497331&r2=1497332&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Thu Jun 27 12:40:59 2013
@@ -227,6 +227,62 @@ public class OOXMLParserTest extends Tik
}
/**
+ * Test that the metadata is already extracted when the body is processed.
+ * See TIKA-1109
+ */
+ public void testPowerPointMetadataEarly() throws Exception {
+ String[] extensions = new String[] {
+ "pptx", "pptm", "ppsm", "ppsx", "potm"
+ //"thmx", // TIKA-418: Will be supported in POI 3.7 beta 2
+ //"xps" // TIKA-418: Not yet supported by POI
+ };
+
+ final String[] mimeTypes = new String[] {
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+ "application/vnd.ms-powerpoint.presentation.macroenabled.12",
+ "application/vnd.ms-powerpoint.slideshow.macroenabled.12",
+ "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
+ "application/vnd.ms-powerpoint.template.macroenabled.12"
+ };
+
+ for (int i=0; i<extensions.length; i++) {
+ String extension = extensions[i];
+ final String filename = "testPPT." + extension;
+
+ Parser parser = new AutoDetectParser();
+ final Metadata metadata = new Metadata();
+ // TODO: should auto-detect without the resource name
+ metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+
+ // Allow the value to be access from the inner class
+ final int currentI = i;
+ ContentHandler handler = new BodyContentHandler()
+ {
+ public void startDocument ()
+ {
+ assertEquals(
+ "Mime-type checking for " + filename,
+ mimeTypes[currentI],
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));
+
+ }
+
+ };
+ ParseContext context = new ParseContext();
+
+ InputStream input = getTestDocument(filename);
+ try {
+ parser.parse(input, handler, metadata, context);
+ } finally {
+ input.close();
+ }
+ }
+ }
+
+ /**
* For the PowerPoint formats we don't currently support, ensure that
* we don't break either
*/