// Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
// URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java?rev=1725014&view=auto
// (added) Sat Jan 16 18:23:01 2016
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.odf;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

import java.io.InputStream;

import org.apache.tika.TikaTest;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.opendocument.OpenOfficeParser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;

/**
 * JUnit tests for OpenDocument (ODF) parsing: text extraction, metadata
 * extraction (current and legacy keys), footers, and basic styles.
 */
public class ODFParserTest extends TikaTest {
    /**
     * For now, allow us to run some tests against both
     * the old and the new parser.
     *
     * @return a fresh instance of each parser the shared tests should cover
     */
    private Parser[] getParsers() {
        return new Parser[]{
                new OpenDocumentParser(),
                new OpenOfficeParser()
        };
    }

    /**
     * Checks content type and body text of an OpenOffice 3 text document
     * with every parser returned by {@link #getParsers()}.
     */
    @Test
    public void testOO3() throws Exception {
        for (Parser parser : getParsers()) {
            try (InputStream input = ODFParserTest.class.getResourceAsStream(
                    "/test-documents/testODFwithOOo3.odt")) {
                Metadata metadata = new Metadata();
                ContentHandler handler = new BodyContentHandler();
                parser.parse(input, handler, metadata, new ParseContext());

                assertEquals(
                        "application/vnd.oasis.opendocument.text",
                        metadata.get(Metadata.CONTENT_TYPE));

                String content = handler.toString();
                assertContains("Tika is part of the Lucene project.", content);
                assertContains("Solr", content);
                assertContains("one embedded", content);
                assertContains("Rectangle Title", content);
                assertContains("a blue background and dark border", content);
            }
        }
    }

    /**
     * Checks content type, metadata (dates, statistics under all three key
     * styles, custom tags) and body text of an OpenOffice 2 text document
     * with every parser returned by {@link #getParsers()}.
     */
    @Test
    public void testOO2() throws Exception {
        for (Parser parser : getParsers()) {
            try (InputStream input = ODFParserTest.class.getResourceAsStream(
                    "/test-documents/testOpenOffice2.odt")) {
                Metadata metadata = new Metadata();
                ContentHandler handler = new BodyContentHandler();
                parser.parse(input, handler, metadata, new ParseContext());

                assertEquals(
                        "application/vnd.oasis.opendocument.text",
                        metadata.get(Metadata.CONTENT_TYPE));
                assertEquals("en-US", metadata.get(Metadata.LANGUAGE));
                assertEquals("PT1M7S", metadata.get(Metadata.EDIT_TIME));
                assertEquals(
                        "NeoOffice/2.2$Unix OpenOffice.org_project/680m18$Build-9161",
                        metadata.get("generator"));

                // Check date metadata, both old-style and new-style
                assertEquals("2007-09-14T11:07:10", metadata.get(TikaCoreProperties.MODIFIED));
                assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.MODIFIED));
                assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.DATE));
                assertEquals("2007-09-14T11:06:08", metadata.get(TikaCoreProperties.CREATED));
                assertEquals("2007-09-14T11:06:08", metadata.get(Metadata.CREATION_DATE));

                // Check the document statistics
                assertEquals("1", metadata.get(Office.PAGE_COUNT));
                assertEquals("1", metadata.get(Office.PARAGRAPH_COUNT));
                assertEquals("14", metadata.get(Office.WORD_COUNT));
                assertEquals("78", metadata.get(Office.CHARACTER_COUNT));
                assertEquals("0", metadata.get(Office.TABLE_COUNT));
                assertEquals("0", metadata.get(Office.OBJECT_COUNT));
                assertEquals("0", metadata.get(Office.IMAGE_COUNT));

                // Check the Tika-1.0 style document statistics
                assertEquals("1", metadata.get(Metadata.PAGE_COUNT));
                assertEquals("1", metadata.get(Metadata.PARAGRAPH_COUNT));
                assertEquals("14", metadata.get(Metadata.WORD_COUNT));
                assertEquals("78", metadata.get(Metadata.CHARACTER_COUNT));
                assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
                assertEquals("0", metadata.get(Metadata.OBJECT_COUNT));
                assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));

                // Check the very old style statistics (these will be removed shortly)
                assertEquals("0", metadata.get("nbTab"));
                assertEquals("0", metadata.get("nbObject"));
                assertEquals("0", metadata.get("nbImg"));
                assertEquals("1", metadata.get("nbPage"));
                assertEquals("1", metadata.get("nbPara"));
                assertEquals("14", metadata.get("nbWord"));
                assertEquals("78", metadata.get("nbCharacter"));

                // Custom metadata tags present but without values
                assertNull(metadata.get("custom:Info 1"));
                assertNull(metadata.get("custom:Info 2"));
                assertNull(metadata.get("custom:Info 3"));
                assertNull(metadata.get("custom:Info 4"));

                String content = handler.toString();
                assertContains(
                        "This is a sample Open Office document,"
                                + " written in NeoOffice 2.2.1 for the Mac.",
                        content);
            }
        }
    }

    /**
     * Similar to {@link #testOO2()}, but using a different
     * OO2 file with different metadata in it
     */
    @Test
    public void testOO2Metadata() throws Exception {
        try (InputStream input = ODFParserTest.class.getResourceAsStream(
                "/test-documents/testOpenOffice2.odf")) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            // Pass an explicit ParseContext, as the other OpenDocumentParser tests do
            new OpenDocumentParser().parse(input, handler, metadata, new ParseContext());

            assertEquals(
                    "application/vnd.oasis.opendocument.formula",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertNull(metadata.get(TikaCoreProperties.MODIFIED));
            assertEquals("2006-01-27T11:55:22", metadata.get(Metadata.CREATION_DATE));
            assertEquals("The quick brown fox jumps over the lazy dog",
                    metadata.get(TikaCoreProperties.TITLE));
            assertEquals("Gym class featuring a brown fox and lazy dog",
                    metadata.get(TikaCoreProperties.DESCRIPTION));
            assertEquals("Gym class featuring a brown fox and lazy dog",
                    metadata.get(OfficeOpenXMLCore.SUBJECT));
            assertEquals("Gym class featuring a brown fox and lazy dog",
                    metadata.get(Metadata.SUBJECT));
            assertEquals("PT0S", metadata.get(Metadata.EDIT_TIME));
            assertEquals("1", metadata.get("editing-cycles"));
            assertEquals(
                    "OpenOffice.org/2.2$Win32 OpenOffice.org_project/680m14$Build-9134",
                    metadata.get("generator"));
            assertEquals("Pangram, fox, dog", metadata.get(Metadata.KEYWORDS));

            // User defined metadata
            assertEquals("Text 1", metadata.get("custom:Info 1"));
            assertEquals("2", metadata.get("custom:Info 2"));
            assertEquals("false", metadata.get("custom:Info 3"));
            assertEquals("true", metadata.get("custom:Info 4"));

            // No statistics present
            assertNull(metadata.get(Metadata.PAGE_COUNT));
            assertNull(metadata.get(Metadata.PARAGRAPH_COUNT));
            assertNull(metadata.get(Metadata.WORD_COUNT));
            assertNull(metadata.get(Metadata.CHARACTER_COUNT));
            assertNull(metadata.get(Metadata.TABLE_COUNT));
            assertNull(metadata.get(Metadata.OBJECT_COUNT));
            assertNull(metadata.get(Metadata.IMAGE_COUNT));
            assertNull(metadata.get("nbTab"));
            assertNull(metadata.get("nbObject"));
            assertNull(metadata.get("nbImg"));
            assertNull(metadata.get("nbPage"));
            assertNull(metadata.get("nbPara"));
            assertNull(metadata.get("nbWord"));
            assertNull(metadata.get("nbCharacter"));

            // Note - contents of maths files not currently supported
            String content = handler.toString();
            assertEquals("", content);
        }
    }

    /**
     * Similar to {@link #testOO2Metadata()}, but using an OO3 file
     */
    @Test
    public void testOO3Metadata() throws Exception {
        try (InputStream input = ODFParserTest.class.getResourceAsStream(
                "/test-documents/testODFwithOOo3.odt")) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            // Pass an explicit ParseContext, as the other OpenDocumentParser tests do
            new OpenDocumentParser().parse(input, handler, metadata, new ParseContext());

            assertEquals(
                    "application/vnd.oasis.opendocument.text",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("2009-10-05T21:22:38", metadata.get(TikaCoreProperties.MODIFIED));
            assertEquals("2009-10-05T19:04:01", metadata.get(TikaCoreProperties.CREATED));
            assertEquals("2009-10-05T19:04:01", metadata.get(Metadata.CREATION_DATE));
            assertEquals("Apache Tika", metadata.get(TikaCoreProperties.TITLE));
            assertEquals("Test document", metadata.get(OfficeOpenXMLCore.SUBJECT));
            assertEquals("Test document", metadata.get(Metadata.SUBJECT));
            assertEquals("A rather complex document", metadata.get(TikaCoreProperties.DESCRIPTION));
            assertEquals("Bart Hanssens", metadata.get(TikaCoreProperties.CREATOR));
            assertEquals("Bart Hanssens", metadata.get("initial-creator"));
            assertEquals("2", metadata.get("editing-cycles"));
            assertEquals("PT02H03M24S", metadata.get(Metadata.EDIT_TIME));
            assertEquals(
                    "OpenOffice.org/3.1$Unix OpenOffice.org_project/310m19$Build-9420",
                    metadata.get("generator"));
            assertEquals("Apache, Lucene, Tika", metadata.get(Metadata.KEYWORDS));

            // User defined metadata
            assertEquals("Bart Hanssens", metadata.get("custom:Editor"));
            assertNull(metadata.get("custom:Info 2"));
            assertNull(metadata.get("custom:Info 3"));
            assertNull(metadata.get("custom:Info 4"));

            // Check the document statistics
            assertEquals("2", metadata.get(Office.PAGE_COUNT));
            assertEquals("13", metadata.get(Office.PARAGRAPH_COUNT));
            assertEquals("54", metadata.get(Office.WORD_COUNT));
            assertEquals("351", metadata.get(Office.CHARACTER_COUNT));
            assertEquals("0", metadata.get(Office.TABLE_COUNT));
            assertEquals("2", metadata.get(Office.OBJECT_COUNT));
            assertEquals("0", metadata.get(Office.IMAGE_COUNT));

            // Check the Tika-1.0 style document statistics
            assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
            assertEquals("13", metadata.get(Metadata.PARAGRAPH_COUNT));
            assertEquals("54", metadata.get(Metadata.WORD_COUNT));
            assertEquals("351", metadata.get(Metadata.CHARACTER_COUNT));
            assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
            assertEquals("2", metadata.get(Metadata.OBJECT_COUNT));
            assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));

            // Check the old style statistics (these will be removed shortly)
            assertEquals("0", metadata.get("nbTab"));
            assertEquals("2", metadata.get("nbObject"));
            assertEquals("0", metadata.get("nbImg"));
            assertEquals("2", metadata.get("nbPage"));
            assertEquals("13", metadata.get("nbPara"));
            assertEquals("54", metadata.get("nbWord"));
            assertEquals("351", metadata.get("nbCharacter"));

            String content = handler.toString();
            assertContains(
                    "Apache Tika Tika is part of the Lucene project.",
                    content);
        }
    }

    /**
     * Checks that master-page footer text of a presentation is extracted
     * when parsing through {@link AutoDetectParser}.
     */
    @Test
    public void testODPMasterFooter() throws Exception {
        try (InputStream input = ODFParserTest.class.getResourceAsStream(
                "/test-documents/testMasterFooter.odp")) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new AutoDetectParser().parse(input, handler, metadata);

            String content = handler.toString();
            assertContains("Master footer is here", content);
        }
    }

    /**
     * Checks that body text and footer text of a text document are extracted
     * when parsing through {@link AutoDetectParser}.
     */
    @Test
    public void testODTFooter() throws Exception {
        try (InputStream input = ODFParserTest.class.getResourceAsStream(
                "/test-documents/testFooter.odt")) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new AutoDetectParser().parse(input, handler, metadata);

            String content = handler.toString();
            assertContains("Here is some text...", content);
            assertContains("Here is some text on page 2", content);
            assertContains("Here is footer text", content);
        }
    }

    /**
     * Checks that footer text of a spreadsheet is extracted
     * when parsing through {@link AutoDetectParser}.
     */
    @Test
    public void testODSFooter() throws Exception {
        try (InputStream input = ODFParserTest.class.getResourceAsStream(
                "/test-documents/testFooter.ods")) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new AutoDetectParser().parse(input, handler, metadata);

            String content = handler.toString();
            assertContains("Here is a footer in the center area", content);
        }
    }

    /**
     * Parses from a {@link TikaInputStream} created from the resource URL,
     * first asserting that the stream reports an underlying file.
     */
    @Test
    public void testFromFile() throws Exception {
        try (TikaInputStream tis = TikaInputStream.get(this.getClass().getResource(
                "/test-documents/testODFwithOOo3.odt"))) {
            assertTrue(tis.hasFile());
            OpenDocumentParser parser = new OpenDocumentParser();
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            parser.parse(tis, handler, metadata, new ParseContext());

            assertEquals(
                    "application/vnd.oasis.opendocument.text",
                    metadata.get(Metadata.CONTENT_TYPE));

            String content = handler.toString();
            assertContains("Tika is part of the Lucene project.", content);
        }
    }

    /**
     * Parses testNPEOpenDocument.odt from a {@link TikaInputStream} created
     * from the resource URL and checks its content type and body text.
     */
    @Test
    public void testNPEFromFile() throws Exception {
        OpenDocumentParser parser = new OpenDocumentParser();
        try (TikaInputStream tis = TikaInputStream.get(this.getClass().getResource(
                "/test-documents/testNPEOpenDocument.odt"))) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            parser.parse(tis, handler, metadata, new ParseContext());

            assertEquals(
                    "application/vnd.oasis.opendocument.text",
                    metadata.get(Metadata.CONTENT_TYPE));

            String content = handler.toString();
            assertContains("primero hay que generar un par de claves", content);
        }
    }

    // TIKA-1063: Test basic style support.
    @Test
    public void testODTStyles() throws Exception {
        String xml = getXML("testStyles.odt").xml;
        assertContains("This <i>is</i> <b>just</b> a <u>test</u>", xml);
        assertContains("<p>And <b>another <i>test</i> is</b> here.</p>", xml);
        assertContains("<ol>\t<li><p>One</p>", xml);
        assertContains("</ol>", xml);
        assertContains("<ul>\t<li><p>First</p>", xml);
        assertContains("</ul>", xml);
    }

    //TIKA-1600: Test that null pointer doesn't break parsing.
    @Test
    public void testNullStylesInODTFooter() throws Exception {
        Parser parser = new OpenDocumentParser();
        try (InputStream input = ODFParserTest.class.getResourceAsStream(
                "/test-documents/testODT-TIKA-6000.odt")) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            parser.parse(input, handler, metadata, new ParseContext());

            assertEquals(
                    "application/vnd.oasis.opendocument.text",
                    metadata.get(Metadata.CONTENT_TYPE));

            String content = handler.toString();

            assertContains("Utilisation de ce document", content);
            assertContains("Copyright and License", content);
            assertContains("Changer la langue", content);
            // The apostrophe is U+2019 (right single quotation mark); written as an
            // escape so the expectation survives any source-file encoding mix-up.
            assertContains("La page d\u2019accueil permet de faire une recherche simple", content);
        }
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,592 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.rtf; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.commons.io.FilenameUtils; +import org.apache.tika.Tika; +import org.apache.tika.TikaTest; +import org.apache.tika.extractor.ContainerExtractor; +import org.apache.tika.extractor.ParserContainerExtractor; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; +import org.apache.tika.metadata.OfficeOpenXMLCore; +import org.apache.tika.metadata.RTFMetadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.WriteOutContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +/** + * Junit test class for the Tika {@link RTFParser} + */ +public class RTFParserTest extends TikaTest { + + private Tika tika = new Tika(); + + @Test + public void testBasicExtraction() throws Exception { + File file = getResourceAsFile("/test-documents/testRTF.rtf"); + + Metadata metadata = new Metadata(); + StringWriter writer = new StringWriter(); + tika.getParser().parse( + new FileInputStream(file), + new WriteOutContentHandler(writer), + metadata, + new ParseContext()); + String content = writer.toString(); + + 
assertEquals("application/rtf", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals(1, metadata.getValues(Metadata.CONTENT_TYPE).length); + assertContains("Test", content); + assertContains("indexation Word", content); + } + + @Test + public void testUmlautSpacesExtraction2() throws Exception { + String content = getText("testRTFUmlautSpaces2.rtf"); + content = content.replaceAll("\\s+", ""); + assertEquals("\u00DCbersicht", content); + } + + @Test + public void testUnicodeUCNControlWordCharacterDoublingExtraction() throws Exception { + String content = getText("testRTFUnicodeUCNControlWordCharacterDoubling.rtf"); + + assertContains("\u5E74", content); + assertContains("\u5ff5", content); + assertContains("0 ", content); + assertContains("abc", content); + assertFalse("Doubled character \u5E74", content.contains("\u5E74\u5E74")); + } + + @Test + public void testHexEscapeInsideWord() throws Exception { + String content = getText("testRTFHexEscapeInsideWord.rtf"); + assertContains("ESP\u00cdRITO", content); + } + + @Test + public void testWindowsCodepage1250() throws Exception { + String content = getText("testRTFWindowsCodepage1250.rtf"); + assertContains("za\u017c\u00f3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144", content); + assertContains("ZA\u017b\u00d3\u0141\u0106 G\u0118\u015aL\u0104 JA\u0179\u0143", content); + } + + @Test + public void testTableCellSeparation() throws Exception { + File file = getResourceAsFile("/test-documents/testRTFTableCellSeparation.rtf"); + String content = tika.parseToString(file); + content = content.replaceAll("\\s+", " "); + assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content); + assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content); + } + + @Test + public void testTableCellSeparation2() throws Exception { + String content = getText("testRTFTableCellSeparation2.rtf"); + // TODO: why do we insert extra whitespace...? 
+ content = content.replaceAll("\\s+", " "); + assertContains("Station Fax", content); + } + + @Test + public void testWordPadCzechCharactersExtraction() throws Exception { + File file = getResourceAsFile("/test-documents/testRTFWordPadCzechCharacters.rtf"); + String s1 = tika.parseToString(file); + assertTrue(s1.contains("\u010Cl\u00E1nek t\u00FDdne")); + assertTrue(s1.contains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty")); + } + + @Test + public void testWord2010CzechCharactersExtraction() throws Exception { + File file = getResourceAsFile("/test-documents/testRTFWord2010CzechCharacters.rtf"); + String s1 = tika.parseToString(file); + assertTrue(s1.contains("\u010Cl\u00E1nek t\u00FDdne")); + assertTrue(s1.contains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty")); + } + + @Test + public void testMS932Extraction() throws Exception { + File file = getResourceAsFile("/test-documents/testRTF-ms932.rtf"); + String s1 = tika.parseToString(file); + + // Hello in Japanese + assertTrue(s1.contains("\u3053\u3093\u306b\u3061\u306f")); + + // Verify title, since it was also encoded with MS932: + Result r = getResult("testRTF-ms932.rtf"); + assertEquals("\u30bf\u30a4\u30c8\u30eb", r.metadata.get(TikaCoreProperties.TITLE)); + } + + @Test + public void testUmlautSpacesExtraction() throws Exception { + File file = getResourceAsFile("/test-documents/testRTFUmlautSpaces.rtf"); + String s1 = tika.parseToString(file); + assertTrue(s1.contains("\u00DCbersicht")); + } + + @Test + public void testGothic() throws Exception { + String content = getText("testRTFUnicodeGothic.rtf"); + assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content); + } + + @Test + public void testJapaneseText() throws Exception { + Result r = getResult("testRTFJapanese.rtf"); + String content = r.text; + + // Verify title -- this title uses upr escape inside + // title info field: + 
assertEquals("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f\u3000", + r.metadata.get(TikaCoreProperties.TITLE)); + assertEquals("VMazel", r.metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("VMazel", r.metadata.get(Metadata.AUTHOR)); + assertEquals("StarWriter", r.metadata.get(TikaCoreProperties.COMMENTS)); + + // Special version of (GHQ) + assertContains("\uff08\uff27\uff28\uff31\uff09", content); + + // 6 other characters + assertContains("\u6771\u4eac\u90fd\u4e09\u9df9\u5e02", content); + } + + @Test + public void testMaxLength() throws Exception { + File file = getResourceAsFile("/test-documents/testRTFJapanese.rtf"); + Metadata metadata = new Metadata(); + InputStream stream = TikaInputStream.get(file, metadata); + + // Test w/ default limit: + Tika localTika = new Tika(); + String content = localTika.parseToString(stream, metadata); + // parseToString closes for convenience: + //stream.close(); + assertTrue(content.length() > 500); + + // Test setting max length on the instance: + localTika.setMaxStringLength(200); + stream = TikaInputStream.get(file, metadata); + content = localTika.parseToString(stream, metadata); + + // parseToString closes for convenience: + //stream.close(); + assertTrue(content.length() <= 200); + + // Test setting max length per-call: + stream = TikaInputStream.get(file, metadata); + content = localTika.parseToString(stream, metadata, 100); + // parseToString closes for convenience: + //stream.close(); + assertTrue(content.length() <= 100); + } + + @Test + public void testTextWithCurlyBraces() throws Exception { + String content = getText("testRTFWithCurlyBraces.rtf"); + assertContains("{ some text inside curly brackets }", content); + } + + @Test + public void testControls() throws Exception { + Result r = getResult("testRTFControls.rtf"); + String content = r.text; + assertContains("Thiswordhasanem\u2014dash", content); + assertContains("Thiswordhasanen\u2013dash", content); + 
assertContains("Thiswordhasanon\u2011breakinghyphen", content); + assertContains("Thiswordhasanonbreaking\u00a0space", content); + assertContains("Thiswordhasanoptional\u00adhyphen", content); + assertContains("\u2018Single quoted text\u2019", content); + assertContains("\u201cDouble quoted text\u201d", content); + assertContains("\u201cDouble quoted text again\u201d", content); + } + + @Test + public void testInvalidUnicode() throws Exception { + Result r = getResult("testRTFInvalidUnicode.rtf"); + String content = r.text; + assertContains("Unpaired hi \ufffd here", content); + assertContains("Unpaired lo \ufffd here", content); + assertContains("Mismatched pair \ufffd\ufffd here", content); + } + + @Test + public void testVarious() throws Exception { + Result r = getResult("testRTFVarious.rtf"); + String content = r.text; + assertContains("Footnote appears here", content); + assertContains("This is a footnote.", content); + assertContains("This is the header text.", content); + assertContains("This is the footer text.", content); + assertContains("Here is a text box", content); + assertContains("Bold", content); + assertContains("italic", content); + assertContains("underline", content); + assertContains("superscript", content); + assertContains("subscript", content); + assertContains("Here is a citation:", content); + assertContains("Figure 1 This is a caption for Figure 1", content); + assertContains("(Kramer)", content); + + // Table + assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+", " ")); + + // 2-columns + assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+", " ")); + assertContains("This is a hyperlink", content); + assertContains("Here is a list:", content); + for (int row = 1; row <= 3; row++) { + assertContains("Bullet " + row, content); + } + assertContains("Here is a numbered list:", content); + for (int row = 1; row <= 3; row++) { + 
assertContains("Number bullet " + row, content); + } + + for (int row = 1; row <= 2; row++) { + for (int col = 1; col <= 3; col++) { + assertContains("Row " + row + " Col " + col, content); + } + } + + assertContains("Keyword1 Keyword2", content); + assertEquals("Keyword1 Keyword2", + r.metadata.get(TikaCoreProperties.KEYWORDS)); + + assertContains("Subject is here", content); + assertEquals("Subject is here", + r.metadata.get(OfficeOpenXMLCore.SUBJECT)); + assertEquals("Subject is here", + r.metadata.get(Metadata.SUBJECT)); + + assertContains("Suddenly some Japanese text:", content); + // Special version of (GHQ) + assertContains("\uff08\uff27\uff28\uff31\uff09", content); + // 6 other characters + assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content); + + assertContains("And then some Gothic text:", content); + assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content); + } + + @Test + public void testVariousStyle() throws Exception { + String content = getXML("testRTFVarious.rtf").xml; + assertContains("<b>Bold</b>", content); + assertContains("<i>italic</i>", content); + } + + @Test + public void testBoldItalic() throws Exception { + String content = getXML("testRTFBoldItalic.rtf").xml; + assertContains("<b>bold</b>", content); + assertContains("<b>bold </b><b><i>italic</i></b>", content); + assertContains("<b><i>italic </i></b><b>bold</b>", content); + assertContains("<i>italic</i>", content); + assertContains("<b>bold then </b><b><i>italic then</i></b><i> not bold</i>", content); + assertContains("<i>italic then </i><b><i>bold then</i></b><b> not italic</b>", content); + } + + @Test + public void testHyperlink() throws Exception { + String content = getXML("testRTFHyperlink.rtf").xml; + assertContains("our most <a href=\"http://r.office.microsoft.com/r/rlidwelcomeFAQ?clid=1033\">frequently asked questions</a>", content); + assertEquals(-1, content.indexOf("<p>\t\t</p>")); + 
} + + @Test + public void testIgnoredControlWord() throws Exception { + assertContains("<p>The quick brown fox jumps over the lazy dog</p>", getXML("testRTFIgnoredControlWord.rtf").xml); + } + + @Test + public void testFontAfterBufferedText() throws Exception { + assertContains("\u0423\u0432\u0430\u0436\u0430\u0435\u043c\u044b\u0439 \u043a\u043b\u0438\u0435\u043d\u0442!", + getXML("testFontAfterBufferedText.rtf").xml); + } + + @Test + public void testListMicrosoftWord() throws Exception { + String content = getXML("testRTFListMicrosoftWord.rtf").xml; + assertContains("<ol>\t<li>one</li>", content); + assertContains("</ol>", content); + assertContains("<ul>\t<li>first</li>", content); + assertContains("</ul>", content); + } + + @Test + public void testListLibreOffice() throws Exception { + String content = getXML("testRTFListLibreOffice.rtf").xml; + assertContains("<ol>\t<li>one</li>", content); + assertContains("</ol>", content); + assertContains("<ul>\t<li>first</li>", content); + assertContains("</ul>", content); + } + + // TIKA-782 + @Test + public void testBinControlWord() throws Exception { + ByteCopyingHandler embHandler = new ByteCopyingHandler(); + try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testBinControlWord.rtf"))) { + ContainerExtractor ex = new ParserContainerExtractor(); + assertEquals(true, ex.isSupported(tis)); + ex.extract(tis, ex, embHandler); + } + assertEquals(1, embHandler.bytes.size()); + + byte[] bytes = embHandler.bytes.get(0); + assertEquals(10, bytes.length); + //} + assertEquals(125, (int) bytes[4]); + //make sure that at least the last value is correct + assertEquals(-1, (int) bytes[9]); + } + + // TIKA-999 + @Test + public void testMetaDataCounts() throws Exception { + XMLResult xml = getXML("test_embedded_package.rtf"); + assertEquals("1", xml.metadata.get(Office.PAGE_COUNT)); + assertEquals("7", xml.metadata.get(Office.WORD_COUNT)); + assertEquals("36", xml.metadata.get(Office.CHARACTER_COUNT)); 
+ assertTrue(xml.metadata.get(Office.CREATION_DATE).startsWith("2012-09-02T")); + } + + // TIKA-1192 + @Test + public void testListOverride() throws Exception { + Result r = getResult("testRTFListOverride.rtf"); + String content = r.text; + assertContains("Body", content); + } + + // TIKA-1305 + @Test + public void testCorruptListOverride() throws Exception { + Result r = getResult("testRTFCorruptListOverride.rtf"); + String content = r.text; + assertContains("apple", content); + } + + // TIKA-1010 + @Test + public void testEmbeddedMonster() throws Exception { + Set<MediaType> skipTypes = new HashSet<MediaType>(); + skipTypes.add(MediaType.parse("application/x-emf")); + skipTypes.add(MediaType.parse("application/x-msmetafile")); + + + List<String> trueNames = new ArrayList<String>(); + trueNames.add("file_0.doc"); + trueNames.add("Hw.txt"); + trueNames.add("file_1.xlsx"); + trueNames.add("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip"); + trueNames.add("html-within-zip.zip"); + trueNames.add("text.html"); + trueNames.add("testHTML_utf8_\u666E\u6797\u65AF\u987F.html"); + trueNames.add("testJPEG_\u666E\u6797\u65AF\u987F.jpg"); + trueNames.add("file_2.xls"); + trueNames.add("testMSG_\u666E\u6797\u65AF\u987F.msg"); + trueNames.add("file_3.pdf"); + trueNames.add("file_4.ppt"); + trueNames.add("file_5.pptx"); + trueNames.add("thumbnail.jpeg"); + trueNames.add("file_6.doc"); + trueNames.add("file_7.doc"); + trueNames.add("file_8.docx"); + trueNames.add("testJPEG_\u666E\u6797\u65AF\u987F.jpg"); + + List<String> trueTypes = new ArrayList<String>(); + trueTypes.add("application/msword"); + trueTypes.add("text/plain"); + trueTypes.add("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); + trueTypes.add("application/zip"); + trueTypes.add("application/zip"); + trueTypes.add("text/html"); + trueTypes.add("text/html"); + trueTypes.add("image/jpeg"); + trueTypes.add("application/vnd.ms-excel"); + trueTypes.add("application/vnd.ms-outlook"); + 
trueTypes.add("application/pdf"); + trueTypes.add("application/vnd.ms-powerpoint"); + trueTypes.add("application/vnd.openxmlformats-officedocument.presentationml.presentation"); + trueTypes.add("image/jpeg"); + trueTypes.add("application/msword"); + trueTypes.add("application/msword"); + trueTypes.add("application/vnd.openxmlformats-officedocument.wordprocessingml.document"); + trueTypes.add("image/jpeg"); + + TrackingHandler tracker = new TrackingHandler(skipTypes); + try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf"))) { + ContainerExtractor ex = new ParserContainerExtractor(); + assertEquals(true, ex.isSupported(tis)); + ex.extract(tis, ex, tracker); + } + + assertEquals(trueNames.size(), tracker.filenames.size()); + assertEquals(trueTypes.size(), tracker.mediaTypes.size()); + for (int i = 0; i < tracker.filenames.size(); i++) { + String expectedName = trueNames.get(i); + if (expectedName == null) { + assertNull(tracker.filenames.get(i)); + } else { + assertNotNull(tracker.filenames.get(i)); + //necessary to getName() because MSOffice extractor includes + //directory: _1457338524/HW.txt + assertEquals("filename equals ", + expectedName, FilenameUtils.getName(tracker.filenames.get(i))); + } + assertEquals(trueTypes.get(i), tracker.mediaTypes.get(i).toString()); + } + + tracker = new TrackingHandler(); + try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf"))) { + ContainerExtractor ex = new ParserContainerExtractor(); + assertEquals(true, ex.isSupported(tis)); + ex.extract(tis, ex, tracker); + } + assertEquals(47, tracker.filenames.size()); + assertEquals("thumbnail_26.emf", tracker.filenames.get(45)); + assertEquals("thumbnail_27.wmf", tracker.filenames.get(46)); + } + + //TIKA-1010 test regular (not "embedded") images/picts + public void testRegularImages() throws Exception { + Parser base = new AutoDetectParser(); + ParseContext ctx = new 
ParseContext(); + RecursiveParserWrapper parser = new RecursiveParserWrapper(base, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1)); + ctx.set(org.apache.tika.parser.Parser.class, parser); + ContentHandler handler = new BodyContentHandler(); + Metadata rootMetadata = new Metadata(); + rootMetadata.add(Metadata.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf"); + try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf"))) { + parser.parse(tis, handler, rootMetadata, ctx); + } + List<Metadata> metadatas = parser.getMetadata(); + + Metadata meta_jpg_exif = metadatas.get(0);//("testJPEG_EXIF_\u666E\u6797\u65AF\u987F.jpg"); + Metadata meta_jpg = metadatas.get(2);//("testJPEG_\u666E\u6797\u65AF\u987F.jpg"); + + assertTrue(meta_jpg_exif != null); + assertTrue(meta_jpg != null); + assertTrue(Arrays.asList(meta_jpg_exif.getValues("dc:subject")).contains("serbor")); + assertTrue(meta_jpg.get("Comments").contains("Licensed to the Apache")); + //make sure old metadata doesn't linger between objects + assertFalse(Arrays.asList(meta_jpg.getValues("dc:subject")).contains("serbor")); + assertEquals("false", meta_jpg.get(RTFMetadata.THUMBNAIL)); + assertEquals("false", meta_jpg_exif.get(RTFMetadata.THUMBNAIL)); + + assertEquals(40, meta_jpg.names().length); + assertEquals(105, meta_jpg.names().length); + } + + @Test + public void testMultipleNewlines() throws Exception { + String content = getXML("testRTFNewlines.rtf").xml; + content = content.replaceAll("[\r\n]+", " "); + assertContains("<body><p>one</p> " + + "<p /> " + + "<p>two</p> " + + "<p /> " + + "<p /> " + + "<p>three</p> " + + "<p /> " + + "<p /> " + + "<p /> " + + "<p>four</p>", content); + } + + //TIKA-1010 test linked embedded doc + @Test + public void testEmbeddedLinkedDocument() throws Exception { + Set<MediaType> skipTypes = new HashSet<MediaType>(); + skipTypes.add(MediaType.parse("application/x-emf")); + 
skipTypes.add(MediaType.parse("application/x-msmetafile")); + + TrackingHandler tracker = new TrackingHandler(skipTypes); + try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) { + ContainerExtractor ex = new ParserContainerExtractor(); + assertEquals(true, ex.isSupported(tis)); + ex.extract(tis, ex, tracker); + } + //should gracefully skip link and not throw NPE, IOEx, etc + assertEquals(0, tracker.filenames.size()); + + tracker = new TrackingHandler(); + try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) { + ContainerExtractor ex = new ParserContainerExtractor(); + assertEquals(true, ex.isSupported(tis)); + ex.extract(tis, ex, tracker); + } + //should gracefully skip link and not throw NPE, IOEx, etc + assertEquals(2, tracker.filenames.size()); + } + + private Result getResult(String filename) throws Exception { + File file = getResourceAsFile("/test-documents/" + filename); + + Metadata metadata = new Metadata(); + StringWriter writer = new StringWriter(); + tika.getParser().parse( + new FileInputStream(file), + new WriteOutContentHandler(writer), + metadata, + new ParseContext()); + String content = writer.toString(); + return new Result(content, metadata); + } + + private String getText(String filename) throws Exception { + return getResult(filename).text; + } + + private static class Result { + public final String text; + public final Metadata metadata; + + public Result(String text, Metadata metadata) { + this.text = text; + this.metadata = metadata; + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/solidworks/SolidworksParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/solidworks/SolidworksParserTest.java?rev=1725014&view=auto 
============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/solidworks/SolidworksParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/solidworks/SolidworksParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.solidworks; + +import static org.junit.Assert.assertEquals; + +import java.io.InputStream; + +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.microsoft.OfficeParser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +public class SolidworksParserTest extends TikaTest { + + /** + * Test the parsing of an solidWorks part in version 2013SP2 + */ + @Test + public void testPart2013SP2Parser() throws Exception { + try (InputStream input = SolidworksParserTest.class.getResourceAsStream( + "/test-documents/testsolidworksPart2013SP2.SLDPRT")) { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + new OfficeParser().parse(input, handler, metadata, new ParseContext()); + + //Check content type + assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE)); + + //Check properties + assertEquals("2012-04-18T10:27:29Z", metadata.get(TikaCoreProperties.CREATED)); + assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR)); + assertEquals("2013-09-06T08:12:12Z", metadata.get(Metadata.MODIFIED)); + assertEquals("solidworks-dcom_dev", metadata.get(TikaCoreProperties.MODIFIER)); + assertEquals(null, metadata.get(TikaCoreProperties.RELATION)); + assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS)); + assertEquals(null, metadata.get(TikaCoreProperties.SOURCE)); + assertEquals("", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS)); + } + } + + /** + * Test the parsing of an solidWorks part in version 2014SP0 + */ + @Test + public void testPart2014SP0Parser() throws Exception { + try (InputStream input = SolidworksParserTest.class.getResourceAsStream( + "/test-documents/testsolidworksPart2014SP0.SLDPRT")) { + 
ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + new OfficeParser().parse(input, handler, metadata, new ParseContext()); + + //Check content type + assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE)); + + //Check properties + assertEquals("2012-04-18T10:27:29Z", metadata.get(TikaCoreProperties.CREATED)); + assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR)); + assertEquals("2013-11-28T12:38:28Z", metadata.get(Metadata.MODIFIED)); + assertEquals("solidworks-dcom_dev", metadata.get(TikaCoreProperties.MODIFIER)); + assertEquals(null, metadata.get(TikaCoreProperties.RELATION)); + assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS)); + assertEquals(null, metadata.get(TikaCoreProperties.SOURCE)); + assertEquals("", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS)); + } + } + + /** + * Test the parsing of an solidWorks assembly in version 2013SP2 + */ + @Test + public void testAssembly2013SP2Parser() throws Exception { + try (InputStream input = SolidworksParserTest.class.getResourceAsStream( + "/test-documents/testsolidworksAssembly2013SP2.SLDASM")) { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + new OfficeParser().parse(input, handler, metadata, new ParseContext()); + + //Check content type + assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE)); + + //Check properties + assertEquals("2012-04-25T09:51:38Z", metadata.get(TikaCoreProperties.CREATED)); + assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR)); + assertEquals("2013-09-06T08:11:08Z", metadata.get(Metadata.MODIFIED)); + assertEquals("solidworks-dcom_dev", metadata.get(TikaCoreProperties.MODIFIER)); + assertEquals(null, metadata.get(TikaCoreProperties.RELATION)); + assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS)); + assertEquals(null, metadata.get(TikaCoreProperties.SOURCE)); + 
assertEquals("", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS)); + } + } + + /** + * Test the parsing of an solidWorks assembly in version 2014SP0 + */ + @Test + public void testAssembly2014SP0Parser() throws Exception { + InputStream input = SolidworksParserTest.class.getResourceAsStream( + "/test-documents/testsolidworksAssembly2014SP0.SLDASM"); + try { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + new OfficeParser().parse(input, handler, metadata, new ParseContext()); + + //Check content type + assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE)); + + //Check properties + assertEquals("2012-04-25T09:51:38Z", metadata.get(TikaCoreProperties.CREATED)); + assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR)); + assertEquals("2013-11-28T12:41:49Z", metadata.get(Metadata.MODIFIED)); + assertEquals("solidworks-dcom_dev", metadata.get(TikaCoreProperties.MODIFIER)); + assertEquals(null, metadata.get(TikaCoreProperties.RELATION)); + assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS)); + assertEquals(null, metadata.get(TikaCoreProperties.SOURCE)); + assertEquals("", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS)); + } finally { + input.close(); + } + } + + /* + * Test the parsing of an solidWorks drawing in version 2013SP2 + */ + @Test + public void testDrawing2013SP2Parser() throws Exception { + try (InputStream input = SolidworksParserTest.class.getResourceAsStream( + "/test-documents/testsolidworksDrawing2013SP2.SLDDRW")) { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + new OfficeParser().parse(input, handler, metadata, new ParseContext()); + + //Check content type + assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE)); + + //Check properties + assertEquals("2012-07-03T12:05:29Z", 
metadata.get(TikaCoreProperties.CREATED)); + assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR)); + assertEquals("2013-09-06T08:06:57Z", metadata.get(Metadata.MODIFIED)); + assertEquals("solidworks-dcom_dev", metadata.get(TikaCoreProperties.MODIFIER)); + assertEquals(null, metadata.get(TikaCoreProperties.RELATION)); + assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS)); + assertEquals(null, metadata.get(TikaCoreProperties.SOURCE)); + assertEquals("", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS)); + } + } + + /** + * Test the parsing of an solidWorks drawing in version 2014SP0 + */ + @Test + public void testDrawing2014SP0Parser() throws Exception { + try (InputStream input = SolidworksParserTest.class.getResourceAsStream( + "/test-documents/testsolidworksDrawing2014SP0.SLDDRW")) { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + new OfficeParser().parse(input, handler, metadata, new ParseContext()); + + //Check content type + assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE)); + + //Check properties + assertEquals("2012-07-03T12:05:29Z", metadata.get(TikaCoreProperties.CREATED)); + assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR)); + assertEquals("2013-11-28T12:41:49Z", metadata.get(Metadata.MODIFIED)); + assertEquals("solidworks-dcom_dev", metadata.get(TikaCoreProperties.MODIFIER)); + assertEquals(null, metadata.get(TikaCoreProperties.RELATION)); + assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS)); + assertEquals(null, metadata.get(TikaCoreProperties.SOURCE)); + assertEquals("", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS)); + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/pom.xml URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/pom.xml?rev=1725014&view=auto 
============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/pom.xml (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/pom.xml Sat Jan 16 18:23:01 2016 @@ -0,0 +1,91 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor + license agreements. See the NOTICE file distributed with this work for additional + information regarding copyright ownership. The ASF licenses this file to + you under the Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. You may obtain a copy of + the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required + by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied. See the License for the specific + language governing permissions and limitations under the License. 
--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parser-modules</artifactId> + <version>2.0-SNAPSHOT</version> + </parent> + + <artifactId>tika-package-parser-module</artifactId> + <name>Apache Tika Package Parser Module</name> + <url>http://tika.apache.org/</url> + + <properties> + <!-- NOTE: sync tukaani version with commons-compress --> + <tukaani.version>1.5</tukaani.version> + </properties> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + </dependency> + + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.poi</groupId> + <artifactId>poi-ooxml</artifactId> + <version>${poi.version}</version> + <exclusions> + <exclusion> + <groupId>stax</groupId> + <artifactId>stax-api</artifactId> + </exclusion> + <exclusion> + <groupId>xml-apis</groupId> + <artifactId>xml-apis</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.tukaani</groupId> + <artifactId>xz</artifactId> + <version>${tukaani.version}</version> + </dependency> + <dependency> + <groupId>com.github.junrar</groupId> + <artifactId>junrar</artifactId> + <version>0.7</version> + </dependency> + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + <version>${commons.io.version}</version> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-compress</artifactId> + <version>${commons.compress.version}</version> + </dependency> + </dependencies> + + <build> + 
<plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/appended-resources/META-INF/LICENSE URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/appended-resources/META-INF/LICENSE?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/appended-resources/META-INF/LICENSE (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/appended-resources/META-INF/LICENSE Sat Jan 16 18:23:01 2016 @@ -0,0 +1,48 @@ +APACHE TIKA SUBCOMPONENTS + +Apache Tika includes a number of subcomponents with separate copyright notices +and license terms. Your use of these subcomponents is subject to the terms and +conditions of the following licenses. + +JUnRAR (https://github.com/edmund-wagner/junrar/) + + JUnRAR is based on the UnRAR tool, and covered by the same license + It was formerly available from http://java-unrar.svn.sourceforge.net/ + + ****** ***** ****** UnRAR - free utility for RAR archives + ** ** ** ** ** ** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ****** ******* ****** License for use and distribution of + ** ** ** ** ** ** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ** ** ** ** ** ** FREE portable version + ~~~~~~~~~~~~~~~~~~~~~ + + The source code of UnRAR utility is freeware. This means: + + 1. All copyrights to RAR and the utility UnRAR are exclusively + owned by the author - Alexander Roshal. + + 2. The UnRAR sources may be used in any software to handle RAR + archives without limitations free of charge, but cannot be used + to re-create the RAR compression algorithm, which is proprietary. 
+ Distribution of modified UnRAR sources in separate form or as a + part of other software is permitted, provided that it is clearly + stated in the documentation and source comments that the code may + not be used to develop a RAR (WinRAR) compatible archiver. + + 3. The UnRAR utility may be freely distributed. It is allowed + to distribute UnRAR inside of other software packages. + + 4. THE RAR ARCHIVER AND THE UnRAR UTILITY ARE DISTRIBUTED "AS IS". + NO WARRANTY OF ANY KIND IS EXPRESSED OR IMPLIED. YOU USE AT + YOUR OWN RISK. THE AUTHOR WILL NOT BE LIABLE FOR DATA LOSS, + DAMAGES, LOSS OF PROFITS OR ANY OTHER KIND OF LOSS WHILE USING + OR MISUSING THIS SOFTWARE. + + 5. Installing and using the UnRAR utility signifies acceptance of + these terms and conditions of the license. + + 6. If you don't agree with terms of the license you must remove + UnRAR files from your storage devices and cease to use the + utility. + + Thank you for your interest in RAR and UnRAR. Alexander L. Roshal Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
import java.util.Locale;

/**
 * Utility class to allow for conversion from an integer to Roman numerals
 * or alpha-numeric symbols in line with Pages auto numbering formats.
 */
class AutoPageNumberUtils {

    /** Number of letters in the alphabet used for alpha-numeric numbering. */
    private static final int MAX = 26;

    /** Roman numeral values in descending order, index-aligned with {@link #ROMAN_SYMBOLS}. */
    private static final int[] ROMAN_VALUES =
            { 1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1 };

    /** Roman numeral symbols, index-aligned with {@link #ROMAN_VALUES}. */
    private static final String[] ROMAN_SYMBOLS =
            { "M", "CM", "D", "CD", "C", "XC", "L", "XL", "X", "IX", "V", "IV", "I" };

    /**
     * Converts a 1-based number to an upper-case letter label in which the
     * letter is repeated once per completed alphabet cycle:
     * 1 = "A", 26 = "Z", 27 = "AA", 28 = "BB", 53 = "AAA".
     *
     * @param i the number to convert
     * @return the label, or an empty string when {@code i} is zero or negative
     */
    public static String asAlphaNumeric(int i) {
        // Non-positive input has no label. Previously 0 gave "" while -1..-25
        // failed with ArrayIndexOutOfBoundsException; now all of them give "".
        if (i <= 0) {
            return "";
        }

        int index = i % MAX;
        int ratio = i / MAX;

        // Exact multiples of 26 belong to the previous cycle and map to 'Z'.
        if (index == 0) {
            ratio--;
            index = MAX;
        }

        char letter = (char) ('A' + index - 1);
        StringBuilder label = new StringBuilder(ratio + 1);
        for (int j = 0; j <= ratio; j++) {
            label.append(letter);
        }
        return label.toString();
    }

    /**
     * Lower-case variant of {@link #asAlphaNumeric(int)}.
     *
     * @param i the number to convert
     * @return the lower-case label
     */
    public static String asAlphaNumericLower(int i) {
        return asAlphaNumeric(i).toLowerCase(Locale.ROOT);
    }

    /**
     * Converts a number to upper-case Roman numerals.
     *
     * @param i the number to convert, from 1 to 3999 inclusive
     * @return the Roman numeral representation
     * @throws NumberFormatException if {@code i} is outside 1-3999
     */
    public static String asRomanNumerals(int i) {
        if (i <= 0 || i > 3999) {
            throw new NumberFormatException("Roman numerals are 1-3999 (" + i + ")");
        }

        StringBuilder numeral = new StringBuilder();
        int remaining = i;
        // Greedy conversion: repeatedly take the largest value that still fits.
        for (int k = 0; k < ROMAN_VALUES.length; k++) {
            while (remaining >= ROMAN_VALUES[k]) {
                numeral.append(ROMAN_SYMBOLS[k]);
                remaining -= ROMAN_VALUES[k];
            }
        }
        return numeral.toString();
    }

    /**
     * Lower-case variant of {@link #asRomanNumerals(int)}.
     *
     * @param i the number to convert, from 1 to 3999 inclusive
     * @return the lower-case Roman numeral representation
     * @throws NumberFormatException if {@code i} is outside 1-3999
     */
    public static String asRomanNumeralsLower(int i) {
        return asRomanNumerals(i).toLowerCase(Locale.ROOT);
    }

}
* contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.iwork; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import javax.xml.namespace.QName; + +import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException; +import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; +import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; +import org.apache.commons.compress.archivers.zip.ZipFile; +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.detect.XmlRootExtractor; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.OfflineContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * A parser for the IWork container files. This includes *.key, *.pages and *.numbers files. 
+ * This parser delegates the relevant entries to a {@link ContentHandler} that parsers the content. + * + * Currently supported formats: + * <ol> + * <li>Keynote format version 2.x. Currently only tested with Keynote version 5.x + * <li>Pages format version 1.x. Currently only tested with Pages version 4.0.x + * <li>Numbers format version 1.x. Currently only tested with Numbers version 2.0.x + * </ol> + */ +public class IWorkPackageParser extends AbstractParser { + + /** Serial version UID */ + private static final long serialVersionUID = -2160322853809682372L; + + /** + * Which files within an iWork file contain the actual content? + */ + public final static Set<String> IWORK_CONTENT_ENTRIES = Collections.unmodifiableSet( + new HashSet<String>(Arrays.asList("index.apxl", "index.xml", "presentation.apxl")) + ); + /** + * All iWork files contain one of these, so we can detect based on it + */ + public final static String IWORK_COMMON_ENTRY = "buildVersionHistory.plist"; + + public enum IWORKDocumentType { + KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation", MediaType.application("vnd.apple.keynote")), + NUMBERS("http://developer.apple.com/namespaces/ls", "document", MediaType.application("vnd.apple.numbers")), + PAGES("http://developer.apple.com/namespaces/sl", "document", MediaType.application("vnd.apple.pages")), + ENCRYPTED(null, null, MediaType.application("x-tika-iworks-protected")); + + private final String namespace; + private final String part; + private final MediaType type; + + IWORKDocumentType(String namespace, String part, MediaType type) { + this.namespace = namespace; + this.part = part; + this.type = type; + } + + public String getNamespace() { + return namespace; + } + + public String getPart() { + return part; + } + + public MediaType getType() { + return type; + } + + public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipFile zip) { + try { + if (entry == null) { + return null; + } + + try (InputStream stream 
= zip.getInputStream(entry)) { + return detectType(stream); + } + } catch (IOException e) { + return null; + } + } + + public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipArchiveInputStream zip) { + if (entry == null) { + return null; + } + + return detectType(zip); + } + + private static IWORKDocumentType detectType(InputStream stream) { + QName qname = new XmlRootExtractor().extractRootElement(stream); + if (qname != null) { + String uri = qname.getNamespaceURI(); + String local = qname.getLocalPart(); + + for (IWORKDocumentType type : values()) { + if(type.getNamespace().equals(uri) && + type.getPart().equals(local)) { + return type; + } + } + } else { + // There was a problem with extracting the root type + // Password Protected iWorks files are funny, but we can usually + // spot them because they encrypt part of the zip stream + try { + stream.read(); + } catch(UnsupportedZipFeatureException e) { + // Compression field was likely encrypted + return ENCRYPTED; + } catch(Exception ignored) { + } + } + return null; + } + } + + /** + * This parser handles all iWorks formats. 
+ */ + private final static Set<MediaType> supportedTypes = + Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( + MediaType.application("vnd.apple.iwork"), + IWORKDocumentType.KEYNOTE.getType(), + IWORKDocumentType.NUMBERS.getType(), + IWORKDocumentType.PAGES.getType() + ))); + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return supportedTypes; + } + + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + ZipArchiveInputStream zip = new ZipArchiveInputStream(stream); + ZipArchiveEntry entry = zip.getNextZipEntry(); + + while (entry != null) { + if (!IWORK_CONTENT_ENTRIES.contains(entry.getName())) { + entry = zip.getNextZipEntry(); + continue; + } + + InputStream entryStream = new BufferedInputStream(zip, 4096); + entryStream.mark(4096); + IWORKDocumentType type = IWORKDocumentType.detectType(entryStream); + entryStream.reset(); + + if(type != null) { + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + ContentHandler contentHandler; + + switch(type) { + case KEYNOTE: + contentHandler = new KeynoteContentHandler(xhtml, metadata); + break; + case NUMBERS: + contentHandler = new NumbersContentHandler(xhtml, metadata); + break; + case PAGES: + contentHandler = new PagesContentHandler(xhtml, metadata); + break; + case ENCRYPTED: + // We can't do anything for the file right now + contentHandler = null; + break; + default: + throw new TikaException("Unhandled iWorks file " + type); + } + + metadata.add(Metadata.CONTENT_TYPE, type.getType().toString()); + xhtml.startDocument(); + if (contentHandler != null) { + context.getSAXParser().parse( + new CloseShieldInputStream(entryStream), + new OfflineContentHandler(contentHandler) + ); + } + xhtml.endDocument(); + } + + entry = zip.getNextZipEntry(); + } + // Don't close the zip InputStream (TIKA-1117). 
+ } + +} Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,174 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.iwork; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +class KeynoteContentHandler extends DefaultHandler { + + public final static String PRESENTATION_WIDTH = "slides-width"; + public final static String PRESENTATION_HEIGHT = "slides-height"; + + private final XHTMLContentHandler xhtml; + private final Metadata metadata; + + private boolean inSlide = false; + private boolean inTheme = false; + private boolean inTitle = false; + private boolean inBody = false; + private String tableId; + private Integer numberOfColumns = null; + private Integer currentColumn = null; + + private boolean inMetadata = false; + private boolean inMetaDataTitle = false; + private boolean inMetaDataAuthors = false; + + private boolean inParsableText = false; + + private int numberOfSlides = 0; + + KeynoteContentHandler(XHTMLContentHandler xhtml, Metadata metadata) { + this.xhtml = xhtml; + this.metadata = metadata; + } + + @Override + public void endDocument() throws SAXException { + metadata.set(Metadata.SLIDE_COUNT, String.valueOf(numberOfSlides)); + } + + @Override + public void startElement( + String uri, String localName, String qName, Attributes attributes) + throws SAXException { + if ("key:theme".equals(qName)) { + inTheme = true; + } else if ("key:slide".equals(qName)) { + inSlide = true; + numberOfSlides++; + xhtml.startElement("div"); + } else if ("key:master-slide".equals(qName)) { + inSlide = true; + xhtml.startElement("div"); + } else if ("key:title-placeholder".equals(qName) && inSlide) { + inTitle = true; + xhtml.startElement("h1"); + } else if ("sf:sticky-note".equals(qName) && inSlide) { + xhtml.startElement("p"); + } else if ("key:notes".equals(qName) && inSlide) { + xhtml.startElement("p"); + } else if 
("key:body-placeholder".equals(qName) && inSlide) { + xhtml.startElement("p"); + inBody = true; + } else if ("key:size".equals(qName) && !inTheme) { + String width = attributes.getValue("sfa:w"); + String height = attributes.getValue("sfa:h"); + metadata.set(PRESENTATION_WIDTH, width); + metadata.set(PRESENTATION_HEIGHT, height); + } else if ("sf:text-body".equals(qName)) { + inParsableText = true; + } else if ("key:metadata".equals(qName)) { + inMetadata = true; + } else if (inMetadata && "key:title".equals(qName)) { + inMetaDataTitle = true; + } else if (inMetadata && "key:authors".equals(qName)) { + inMetaDataAuthors = true; + } else if (inMetaDataTitle && "key:string".equals(qName)) { + metadata.set(TikaCoreProperties.TITLE, attributes.getValue("sfa:string")); + } else if (inMetaDataAuthors && "key:string".equals(qName)) { + metadata.add(TikaCoreProperties.CREATOR, attributes.getValue("sfa:string")); + } else if (inSlide && "sf:tabular-model".equals(qName)) { + tableId = attributes.getValue("sfa:ID"); + xhtml.startElement("table"); + } else if (tableId != null && "sf:columns".equals(qName)) { + numberOfColumns = Integer.parseInt(attributes.getValue("sf:count")); + currentColumn = 0; + } else if (tableId != null && "sf:ct".equals(qName)) { + parseTableData(attributes.getValue("sfa:s")); + } else if (tableId != null && "sf:n".equals(qName)) { + parseTableData(attributes.getValue("sf:v")); + } else if ("sf:p".equals(qName)) { + xhtml.startElement("p"); + } + } + + @Override + public void endElement(String uri, String localName, String qName) + throws SAXException { + if ("key:theme".equals(qName)) { + inTheme = false; + } else if ("key:slide".equals(qName)) { + inSlide = false; + xhtml.endElement("div"); + } else if ("key:master-slide".equals(qName)) { + inSlide = false; + xhtml.endElement("div"); + } else if ("key:title-placeholder".equals(qName) && inSlide) { + inTitle = false; + xhtml.endElement("h1"); + } else if ("sf:sticky-note".equals(qName) && inSlide) { + 
xhtml.endElement("p"); + } else if ("key:notes".equals(qName) && inSlide) { + xhtml.endElement("p"); + } else if ("key:body-placeholder".equals(qName) && inSlide) { + xhtml.endElement("p"); + inBody = false; + } else if ("sf:text-body".equals(qName)) { + inParsableText = false; + } else if ("key:metadata".equals(qName)) { + inMetadata = false; + } else if (inMetadata && "key:title".equals(qName)) { + inMetaDataTitle = false; + } else if (inMetadata && "key:authors".equals(qName)) { + inMetaDataAuthors = false; + } else if (inSlide && "sf:tabular-model".equals(qName)) { + xhtml.endElement("table"); + tableId = null; + numberOfColumns = null; + currentColumn = null; + } else if ("sf:p".equals(qName)) { + xhtml.endElement("p"); + } + } + + @Override + public void characters(char[] ch, int start, int length) + throws SAXException { + if (inParsableText && inSlide && length != 0) { + xhtml.characters(ch, start, length); + } + } + + private void parseTableData(String value) throws SAXException { + if (currentColumn == 0) { + xhtml.startElement("tr"); + } + + xhtml.element("td", value); + + if (currentColumn.equals(numberOfColumns)) { + xhtml.endElement("tr"); + } + } + +}
