Merge master into TIKA-1343
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/fe559b80 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/fe559b80 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/fe559b80 Branch: refs/heads/master Commit: fe559b80bcad1f107904ca7a89724a26ea2921a1 Parents: 4aff483 23a11ef Author: Lewis John McGibbney <[email protected]> Authored: Fri Jul 1 13:35:52 2016 -0700 Committer: Lewis John McGibbney <[email protected]> Committed: Fri Jul 1 13:35:52 2016 -0700 ---------------------------------------------------------------------- CHANGES.txt | 38 +- pom.xml | 2 +- tika-app/pom.xml | 2 +- .../main/java/org/apache/tika/cli/TikaCLI.java | 9 +- tika-batch/pom.xml | 2 +- tika-bundle/pom.xml | 2 +- tika-core/pom.xml | 2 +- .../org/apache/tika/detect/NameDetector.java | 15 +- .../tika/detect/ZeroSizeFileDetector.java | 45 + .../java/org/apache/tika/io/EndianUtils.java | 829 +++--- .../tika/metadata/TikaCoreProperties.java | 7 + .../java/org/apache/tika/mime/MediaType.java | 3 + .../org/apache/tika/mime/MediaTypeRegistry.java | 2 + .../org/apache/tika/mime/tika-mimetypes.xml | 69 +- .../java/org/apache/tika/TikaDetectionTest.java | 2 +- .../src/test/java/org/apache/tika/TikaTest.java | 6 +- .../apache/tika/detect/NameDetectorTest.java | 10 + .../tika/detect/ZeroSizeFileDetectorTest.java | 64 + .../org/apache/tika/io/EndianUtilsTest.java | 35 + tika-example/pom.xml | 2 +- tika-java7/pom.xml | 2 +- tika-langdetect/pom.xml | 3 +- ...apache.tika.language.detect.LanguageDetector | 15 + tika-parent/pom.xml | 4 +- tika-parsers/pom.xml | 4 +- .../parser/apple/AppleSingleFileParser.java | 205 ++ .../org/apache/tika/parser/dbf/DBFCell.java | 147 + .../apache/tika/parser/dbf/DBFColumnHeader.java | 97 + .../apache/tika/parser/dbf/DBFFileHeader.java | 144 + .../org/apache/tika/parser/dbf/DBFParser.java | 155 ++ .../org/apache/tika/parser/dbf/DBFReader.java | 207 ++ .../java/org/apache/tika/parser/dbf/DBFRow.java | 62 + .../apache/tika/parser/geo/topic/GeoParser.java | 14 +- .../tika/parser/html/HtmlEncodingDetector.java | 16 +- .../apache/tika/parser/html/HtmlHandler.java | 3 + .../tika/parser/image/xmp/JempboxExtractor.java | 30 + .../iwork/iwana/IWork13PackageParser.java | 86 + .../tika/parser/mail/MailContentHandler.java | 110 +- .../microsoft/AbstractPOIFSExtractor.java | 32 +- .../tika/parser/microsoft/HSLFExtractor.java | 32 +- .../parser/microsoft/JackcessExtractor.java | 4 +- .../parser/microsoft/MSOwnerFileParser.java | 81 + .../tika/parser/microsoft/OfficeParser.java | 2 +- .../tika/parser/microsoft/WordExtractor.java | 22 +- .../microsoft/ooxml/AbstractOOXMLExtractor.java | 12 +- .../ooxml/XSLFPowerPointExtractorDecorator.java | 58 +- .../ooxml/XSSFExcelExtractorDecorator.java | 99 +- .../microsoft/xml/AbstractXML2003Parser.java | 128 + .../parser/microsoft/xml/HyperlinkHandler.java | 96 + .../microsoft/xml/SpreadsheetMLParser.java | 175 ++ .../tika/parser/microsoft/xml/WordMLParser.java | 306 +++ .../parser/ner/grobid/GrobidNERecogniser.java | 28 +- .../tika/parser/ocr/TesseractOCRParser.java | 87 +- .../tika/parser/pdf/AbstractPDF2XHTML.java | 578 ++++ .../org/apache/tika/parser/pdf/OCR2XHTML.java | 127 + .../org/apache/tika/parser/pdf/PDF2XHTML.java | 518 +--- .../org/apache/tika/parser/pdf/PDFParser.java | 7 + .../apache/tika/parser/pdf/PDFParserConfig.java | 274 +- .../tika/parser/pkg/ZipContainerDetector.java | 12 + .../tika/parser/rtf/RTFEmbObjHandler.java | 7 +- .../tika/parser/rtf/RTFObjDataParser.java | 43 +- .../apache/tika/parser/rtf/TextExtractor.java | 11 +- .../services/org.apache.tika.parser.Parser | 7 +- .../apache/tika/parser/pdf/PDFParser.properties | 10 +- .../tika/detect/TestContainerAwareDetector.java | 11 + .../org/apache/tika/mime/TestMimeTypes.java | 38 +- .../parser/apple/AppleSingleFileParserTest.java | 46 + .../apache/tika/parser/dbf/DBFParserTest.java | 158 ++ .../apache/tika/parser/html/HtmlParserTest.java | 60 +- .../parser/image/xmp/JempboxExtractorTest.java | 29 +- .../tika/parser/mail/RFC822ParserTest.java | 115 + .../tika/parser/microsoft/ExcelParserTest.java | 28 +- .../parser/microsoft/MSOwnerFileParserTest.java | 31 + .../microsoft/POIContainerExtractionTest.java | 4 +- .../parser/microsoft/PowerPointParserTest.java | 13 +- .../tika/parser/microsoft/WordParserTest.java | 19 + .../ooxml/OOXMLContainerExtractionTest.java | 2 +- .../parser/microsoft/ooxml/OOXMLParserTest.java | 43 +- .../parser/microsoft/xml/XML2003ParserTest.java | 109 + .../apache/tika/parser/pdf/PDFParserTest.java | 74 +- .../apache/tika/parser/rtf/RTFParserTest.java | 127 +- .../test-documents/testAppleSingleFile.pdf | Bin 0 -> 1893 bytes .../test/resources/test-documents/testDBF.dbf | Bin 0 -> 890 bytes .../test-documents/testDBF_gb18030.dbf | Bin 0 -> 144 bytes .../test/resources/test-documents/testDJVU.djvu | Bin 0 -> 89 bytes .../resources/test-documents/testEXCEL2003.xml | 100 + .../test-documents/testEXCEL_hyperlinks.xls | Bin 0 -> 29696 bytes .../test-documents/testEXCEL_hyperlinks.xlsx | Bin 0 -> 10038 bytes .../resources/test-documents/testEXCEL_poi.xlsx | Bin 0 -> 3360 bytes .../test-documents/testEndNoteImportFile.enw | 10 + .../test-documents/testExcel_embeddedPDF.xls | Bin 0 -> 38400 bytes .../test-documents/testExcel_embeddedPDF.xlsx | Bin 0 -> 25602 bytes .../resources/test-documents/testICalendar.ics | 15 + .../test-documents/testKeynote2013.key | Bin 0 -> 274397 bytes .../resources/test-documents/testKeynoteNew.key | Bin 274397 -> 0 bytes .../resources/test-documents/testMSOwnerFile | Bin 0 -> 162 bytes .../test-documents/testNumbers2013.numbers | Bin 0 -> 179147 bytes .../test-documents/testNumbersNew.numbers | Bin 179147 -> 0 bytes .../test-documents/testPPT_EmbeddedPDF.ppt | Bin 0 -> 187392 bytes .../test-documents/testPPT_EmbeddedPDF.pptx | Bin 0 -> 108637 bytes .../test-documents/testPages2013.pages | Bin 0 -> 237567 bytes .../resources/test-documents/testPagesNew.pages | Bin 237567 -> 0 bytes .../test-documents/testRFC822_date_utf8 | 8 + .../resources/test-documents/testRFC822_eml | 33 + .../resources/test-documents/testVCalendar.vcs | 10 + .../resources/test-documents/testWORD2003.xml | 2542 ++++++++++++++++++ .../test-documents/testWindowsMediaMeta.asx | 6 + .../test/resources/test-documents/testXMP.xmp | 178 ++ .../test-documents/test_recursive_embedded.doc | Bin 0 -> 31744 bytes tika-serialization/pom.xml | 2 +- tika-server/pom.xml | 2 +- tika-translate/pom.xml | 2 +- .../translate/translator.yandex.properties | 2 +- tika-xmp/pom.xml | 2 +- 114 files changed, 7822 insertions(+), 1203 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/fe559b80/tika-parsers/pom.xml ----------------------------------------------------------------------
