jukka
Tue, 26 Jan 2010 09:25:44 -0800
Author: jukka
Date: Tue Jan 26 17:25:18 2010
New Revision: 903329

URL: http://svn.apache.org/viewvc?rev=903329&view=rev
Log:
TIKA-364: [PATCH] Metadata mark for xlsx documents with protected sheets

Patch by Maxim Valyanskiy

Added:
    lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/protected.xlsx (with props)
Modified:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java?rev=903329&r1=903328&r2=903329&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java Tue Jan 26 17:25:18 2010
@@ -23,4 +23,5 @@
     String RESOURCE_NAME_KEY = "resourceName";
+    String PROTECTED = "protected";
 }

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=903329&r1=903328&r2=903329&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java Tue Jan 26 17:25:18 2010
@@ -32,6 +32,9 @@
 import org.apache.poi.xssf.usermodel.XSSFSheet;
 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.exception.TikaException;
 import org.apache.xmlbeans.XmlException;
 import org.xml.sax.SAXException;
@@ -42,10 +45,14 @@
      */
     private final DataFormatter formatter = new DataFormatter();
+    private final XSSFExcelExtractor extractor;
+    private static final String TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
     public XSSFExcelExtractorDecorator(
             XSSFExcelExtractor extractor, Locale locale) {
-        super(extractor, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+        super(extractor, TYPE);
+
+        this.extractor = extractor;
     }
     /**
@@ -129,4 +136,26 @@
             xhtml.element("p", content);
         }
     }
+
+    @Override
+    public MetadataExtractor getMetadataExtractor() {
+        return new MetadataExtractor(extractor, TYPE) {
+            @Override
+            public void extract(Metadata metadata) throws TikaException {
+                super.extract(metadata);
+
+                metadata.set(TikaMetadataKeys.PROTECTED, "false");
+
+                XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
+
+                for (int i = 0; i < document.getNumberOfSheets(); i++) {
+                    XSSFSheet sheet = document.getSheetAt(i);
+
+                    if (sheet.getProtect()) {
+                        metadata.set(TikaMetadataKeys.PROTECTED, "true");
+                    }
+                }
+            }
+        };
+    }
 }

Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=903329&r1=903328&r2=903329&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Tue Jan 26 17:25:18 2010
@@ -22,6 +22,7 @@
 import junit.framework.TestCase;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
@@ -67,6 +68,7 @@
             assertFalse(content.contains("9.0"));
             assertTrue(content.contains("196"));
             assertFalse(content.contains("196.0"));
+            assertEquals("false", metadata.get(TikaMetadataKeys.PROTECTED));
         } finally {
             input.close();
         }
@@ -190,4 +192,25 @@
         }
     }
+    public void testProtectedExcel() throws Exception {
+        InputStream input = OOXMLParserTest.class
+                .getResourceAsStream("/test-documents/protected.xlsx");
+
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        try {
+            parser.parse(input, handler, metadata);
+
+            assertEquals(
+                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                    metadata.get(Metadata.CONTENT_TYPE));
+
+            assertEquals("true", metadata.get(TikaMetadataKeys.PROTECTED));
+        } finally {
+            input.close();
+        }
+    }
+
 }
\ No newline at end of file

Added: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/protected.xlsx
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/protected.xlsx?rev=903329&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/protected.xlsx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream