Author: jukka
Date: Tue Jan 26 17:25:18 2010
New Revision: 903329

URL: http://svn.apache.org/viewvc?rev=903329&view=rev
Log:
TIKA-364: [PATCH] Metadata mark for xlsx documents with protected sheets

Patch by Maxim Valyanskiy

Added:
    lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/protected.xlsx   (with props)
Modified:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java?rev=903329&r1=903328&r2=903329&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java Tue Jan 26 17:25:18 2010
@@ -23,4 +23,5 @@
 
     String RESOURCE_NAME_KEY = "resourceName";
 
+    String PROTECTED = "protected";
 }

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=903329&r1=903328&r2=903329&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java Tue Jan 26 17:25:18 2010
@@ -32,6 +32,9 @@
 import org.apache.poi.xssf.usermodel.XSSFSheet;
 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.exception.TikaException;
 import org.apache.xmlbeans.XmlException;
 import org.xml.sax.SAXException;
 
@@ -42,10 +45,14 @@
      */
        private final DataFormatter formatter = new DataFormatter();
 
+    private final XSSFExcelExtractor extractor;
+    private static final String TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
 
     public XSSFExcelExtractorDecorator(
             XSSFExcelExtractor extractor, Locale locale) {
-        super(extractor, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+        super(extractor, TYPE);
+
+        this.extractor = extractor;
     }
 
     /**
@@ -129,4 +136,26 @@
             xhtml.element("p", content);
         }
     }
+
+    @Override
+    public MetadataExtractor getMetadataExtractor() {
+        return new MetadataExtractor(extractor, TYPE) {
+            @Override
+            public void extract(Metadata metadata) throws TikaException {
+                super.extract(metadata);
+
+                metadata.set(TikaMetadataKeys.PROTECTED, "false");
+
+                XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
+
+                for (int i = 0; i < document.getNumberOfSheets(); i++) {
+                    XSSFSheet sheet = document.getSheetAt(i);
+
+                    if (sheet.getProtect()) {
+                        metadata.set(TikaMetadataKeys.PROTECTED, "true");
+                    }
+                }
+            }
+        };
+    }
 }

Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=903329&r1=903328&r2=903329&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Tue Jan 26 17:25:18 2010
@@ -22,6 +22,7 @@
 import junit.framework.TestCase;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
@@ -67,6 +68,7 @@
             assertFalse(content.contains("9.0"));
             assertTrue(content.contains("196"));
             assertFalse(content.contains("196.0"));
+            assertEquals("false", metadata.get(TikaMetadataKeys.PROTECTED));
         } finally {
             input.close();
         }
@@ -190,4 +192,25 @@
         }
     }
 
+    public void testProtectedExcel() throws Exception {
+        InputStream input = OOXMLParserTest.class
+                .getResourceAsStream("/test-documents/protected.xlsx");
+
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        try {
+            parser.parse(input, handler, metadata);
+
+            assertEquals(
+                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                    metadata.get(Metadata.CONTENT_TYPE));
+
+            assertEquals("true", metadata.get(TikaMetadataKeys.PROTECTED));
+        } finally {
+            input.close();
+        }
+    }
+
 }
\ No newline at end of file

Added: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/protected.xlsx
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/protected.xlsx?rev=903329&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/protected.xlsx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream


Reply via email to