Merge branch 'master' into TIKA-1343
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d50a6936
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d50a6936
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d50a6936
Branch: refs/heads/master
Commit: d50a69361bd0196fb2595313cb47222f61701ba4
Parents: a1250ff 07aea36
Author: Lewis John McGibbney <[email protected]>
Authored: Wed Sep 21 08:06:47 2016 -0700
Committer: Lewis John McGibbney <[email protected]>
Committed: Wed Sep 21 08:06:47 2016 -0700
----------------------------------------------------------------------
CHANGES.txt | 30 +
tika-bundle/pom.xml | 2 +-
.../main/java/org/apache/tika/config/Field.java | 45 +
.../org/apache/tika/config/Initializable.java | 33 +
.../main/java/org/apache/tika/config/Param.java | 191 +++++
.../java/org/apache/tika/config/ParamField.java | 158 ++++
.../java/org/apache/tika/config/TikaConfig.java | 47 +-
.../tika/exception/TikaConfigException.java | 39 +
.../org/apache/tika/parser/AbstractParser.java | 10 +
.../java/org/apache/tika/parser/Parser.java | 1 +
.../tika/parser/external/ExternalParser.java | 85 +-
.../apache/tika/sax/XHTMLContentHandler.java | 5 +-
.../org/apache/tika/utils/AnnotationUtils.java | 138 +++
.../apache/tika/utils/ServiceLoaderUtils.java | 30 +
.../org/apache/tika/mime/tika-mimetypes.xml | 67 +-
.../java/org/apache/tika/config/ParamTest.java | 71 ++
.../tika/parser/DummyInitializableParser.java | 68 ++
.../tika/parser/DummyParameterizedParser.java | 113 +++
.../tika/parser/InitializableParserTest.java | 45 +
.../tika/parser/ParameterizedParserTest.java | 125 +++
.../apache/tika/utils/AnnotationUtilsTest.java | 190 +++++
.../tika/config/TIKA-1508-configurable.xml | 37 +
.../tika/config/TIKA-1986-bad-parameters.xml | 26 +
.../apache/tika/config/TIKA-1986-bad-types.xml | 26 +
.../apache/tika/config/TIKA-1986-bad-values.xml | 26 +
.../tika/config/TIKA-1986-initializable.xml | 28 +
.../TIKA-1986-parameterized-decorated.xml | 39 +
.../tika/config/TIKA-1986-parameterized.xml | 38 +
.../tika/config/TIKA-1986-some-parameters.xml | 28 +
tika-parent/pom.xml | 12 +-
tika-parsers/pom.xml | 26 +-
.../chm/accessor/ChmDirectoryListingSet.java | 11 +-
.../apache/tika/parser/chm/core/ChmCommons.java | 5 +-
.../tika/parser/chm/core/ChmExtractor.java | 4 +-
.../apache/tika/parser/chm/lzx/ChmLzxBlock.java | 4 +-
.../tika/parser/mail/MailContentHandler.java | 13 +-
.../org/apache/tika/parser/mat/MatParser.java | 5 +
.../tika/parser/microsoft/ExcelExtractor.java | 34 +-
.../microsoft/TikaExcelDataFormatter.java | 41 +
.../microsoft/TikaExcelGeneralFormat.java | 90 ++
.../tika/parser/microsoft/WordExtractor.java | 20 +
.../microsoft/ooxml/MetadataExtractor.java | 15 +-
.../ooxml/XSSFExcelExtractorDecorator.java | 20 +-
.../ooxml/XWPFWordExtractorDecorator.java | 52 +-
.../microsoft/xml/AbstractXML2003Parser.java | 4 +
.../tika/parser/microsoft/xml/WordMLParser.java | 3 +
.../tika/parser/ocr/TesseractOCRConfig.java | 181 +++-
.../tika/parser/ocr/TesseractOCRParser.java | 113 ++-
.../parser/odf/OpenDocumentContentParser.java | 3 +
.../tika/parser/pdf/AbstractPDF2XHTML.java | 16 +-
.../org/apache/tika/parser/pdf/PDF2XHTML.java | 1 -
.../org/apache/tika/parser/pdf/PDFParser.java | 29 +-
.../apache/tika/parser/pdf/PDFParserConfig.java | 86 +-
.../parser/recognition/ObjectRecogniser.java | 75 ++
.../recognition/ObjectRecognitionParser.java | 171 ++++
.../parser/recognition/RecognisedObject.java | 91 ++
.../tf/TensorflowImageRecParser.java | 152 ++++
.../tf/TensorflowRESTRecogniser.java | 142 ++++
.../apache/tika/parser/txt/CharsetDetector.java | 416 +++++----
.../apache/tika/parser/txt/CharsetMatch.java | 139 ++-
.../tika/parser/txt/CharsetRecog_2022.java | 28 +-
.../tika/parser/txt/CharsetRecog_UTF8.java | 24 +-
.../tika/parser/txt/CharsetRecog_Unicode.java | 99 ++-
.../tika/parser/txt/CharsetRecog_mbcs.java | 44 +-
.../tika/parser/txt/CharsetRecog_sbcs.java | 835 ++++++++++---------
.../tika/parser/txt/CharsetRecognizer.java | 31 +-
.../parser/ocr/TesseractOCRConfig.properties | 13 +-
.../org/apache/tika/parser/ocr/rotation.py | 72 ++
.../recognition/tf/InceptionRestDockerfile | 41 +
.../parser/recognition/tf/classify_image.py | 212 +++++
.../tika/parser/recognition/tf/inceptionapi.py | 319 +++++++
.../org/apache/tika/mime/TestMimeTypes.java | 13 +
.../tika/parser/chm/TestChmExtractor.java | 21 +-
.../apache/tika/parser/html/HtmlParserTest.java | 140 +++-
.../tika/parser/mail/RFC822ParserTest.java | 68 +-
.../apache/tika/parser/mbox/MboxParserTest.java | 1 -
.../tika/parser/microsoft/ExcelParserTest.java | 10 +
.../tika/parser/microsoft/WordParserTest.java | 11 +-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 28 +-
.../parser/microsoft/xml/XML2003ParserTest.java | 1 +
.../tika/parser/ocr/TesseractOCRConfigTest.java | 61 +-
.../tika/parser/ocr/TesseractOCRParserTest.java | 18 +-
.../apache/tika/parser/pdf/PDFParserTest.java | 44 +-
.../ObjectRecognitionParserTest.java | 89 ++
.../tf/TensorflowImageRecParserTest.java | 58 ++
.../parser/pdf/tika-config-non-primitives.xml | 29 +
.../org/apache/tika/parser/pdf/tika-config.xml | 26 +
.../recognition/tika-config-tflow-rest.xml | 30 +
.../parser/recognition/tika-config-tflow.xml | 29 +
.../resources/test-documents/testChm_oom.chm | Bin 0 -> 4315 bytes
.../test-documents/testEXCEL_big_numbers.xls | Bin 0 -> 26112 bytes
.../test-documents/testEXCEL_big_numbers.xlsx | Bin 0 -> 8396 bytes
.../test-documents/testEmailWithPNGAtt.eml | 354 ++++++++
.../resources/test-documents/testHTML_head.html | 32 +
.../test-documents/testOpenOffice2.odt | Bin 26448 -> 26460 bytes
.../resources/test-documents/testStataDTA.dta | Bin 0 -> 1207 bytes
.../resources/test-documents/testStataDTA.txt | 15 +
.../resources/test-documents/testWORD2003.xml | 2 +-
.../test-documents/testWORD_boldHyperlink.doc | Bin 0 -> 27136 bytes
.../test-documents/testWORD_boldHyperlink.docx | Bin 0 -> 12382 bytes
.../testWORD_totalTimeOutOfRange.docx | Bin 0 -> 11047 bytes
.../TesseractOCRConfig-full.properties | 6 +
.../TesseractOCRConfig-partial.properties | 8 +-
tika-translate/pom.xml | 2 +-
104 files changed, 5612 insertions(+), 917 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/d50a6936/tika-parsers/pom.xml
----------------------------------------------------------------------
