merged upstream changes and resolved conflicts
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/e780d566 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/e780d566 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/e780d566 Branch: refs/heads/master Commit: e780d56652d48dd0f50b4e62a58153e95f055022 Parents: 0d69ca7 bb46c0e Author: Thamme Gowda <[email protected]> Authored: Mon May 23 11:30:13 2016 -0700 Committer: Thamme Gowda <[email protected]> Committed: Mon May 23 11:30:13 2016 -0700 ---------------------------------------------------------------------- CHANGES.txt | 85 +- pom.xml | 3 +- tika-app/pom.xml | 7 +- .../main/appended-resources/META-INF/LICENSE | 227 - .../tika/cli/BatchCommandLineBuilder.java | 7 - .../main/java/org/apache/tika/cli/TikaCLI.java | 50 +- .../main/resources/tika-app-batch-config.xml | 10 +- .../tika/cli/TikaCLIBatchCommandLineTest.java | 1 - .../java/org/apache/tika/cli/TikaCLITest.java | 16 - tika-batch/pom.xml | 4 +- .../batch/builders/BatchProcessBuilder.java | 15 +- .../builders/CommandLineParserBuilder.java | 16 +- .../apache/tika/batch/fs/FSBatchProcessCLI.java | 4 +- .../builders/BasicTikaFSConsumersBuilder.java | 51 +- .../tika/batch/fs/default-tika-batch-config.xml | 50 +- .../apache/tika/batch/fs/BatchProcessTest.java | 19 +- .../tika/batch/fs/HandlerBuilderTest.java | 4 - .../tika-batch-config-MockConsumersBuilder.xml | 2 +- .../test/resources/tika-batch-config-broken.xml | 2 +- .../tika-batch-config-test-suffix-override.xml | 112 + .../test/resources/tika-batch-config-test.xml | 2 +- tika-bundle/pom.xml | 6 +- .../main/appended-resources/META-INF/LICENSE | 226 - tika-core/pom.xml | 7 +- .../java/org/apache/tika/config/TikaConfig.java | 26 +- .../tika/config/TikaConfigSerializer.java | 4 +- .../org/apache/tika/detect/NameDetector.java | 15 +- .../tika/detect/ZeroSizeFileDetector.java | 45 + .../java/org/apache/tika/fork/ForkClient.java | 10 +- .../tika/language/LanguageIdentifier.java | 7 +- .../apache/tika/language/LanguageProfile.java | 2 + .../tika/language/LanguageProfilerBuilder.java | 9 +- .../apache/tika/language/ProfilingHandler.java | 3 +- .../apache/tika/language/ProfilingWriter.java | 2 + .../language/detect/LanguageConfidence.java | 25 + .../tika/language/detect/LanguageDetector.java | 239 + .../tika/language/detect/LanguageHandler.java | 66 + .../tika/language/detect/LanguageNames.java | 86 + .../tika/language/detect/LanguageResult.java | 98 + .../tika/language/detect/LanguageWriter.java | 78 + .../org/apache/tika/language/package-info.java | 22 - .../tika/metadata/TikaCoreProperties.java | 9 + .../java/org/apache/tika/mime/MediaType.java | 3 + .../org/apache/tika/mime/MediaTypeRegistry.java | 2 + .../org/apache/tika/mime/MimeTypesReader.java | 20 +- .../org/apache/tika/parser/NetworkParser.java | 4 +- .../org/apache/tika/parser/ParseContext.java | 169 +- .../org/apache/tika/parser/ParserDecorator.java | 35 +- .../tika/parser/external/ExternalParser.java | 8 +- .../external/ExternalParsersConfigReader.java | 11 +- .../tika/sax/BasicContentHandlerFactory.java | 8 + .../src/main/java/org/apache/tika/sax/Link.java | 4 + .../java/org/apache/tika/sax/LinkBuilder.java | 6 +- .../org/apache/tika/sax/LinkContentHandler.java | 18 +- .../resources/org/apache/tika/language/be.ngp | 0 .../resources/org/apache/tika/language/ca.ngp | 0 .../resources/org/apache/tika/language/eo.ngp | 0 .../resources/org/apache/tika/language/gl.ngp | 0 .../resources/org/apache/tika/language/ro.ngp | 0 .../resources/org/apache/tika/language/sk.ngp | 0 .../resources/org/apache/tika/language/sl.ngp | 0 .../resources/org/apache/tika/language/uk.ngp | 0 .../org/apache/tika/mime/tika-mimetypes.xml | 93 +- .../src/test/java/org/apache/tika/TikaTest.java | 59 +- .../apache/tika/detect/NameDetectorTest.java | 10 + .../tika/detect/ZeroSizeFileDetectorTest.java | 64 + .../tika/language/LanguageIdentifierTest.java | 1 + .../tika/language/LanguageProfileTest.java | 7 +- .../language/LanguageProfilerBuilderTest.java | 1 + .../tika/language/ProfilingWriterTest.java | 5 +- .../tika/language/detect/LanguageNamesTest.java | 38 + .../org/apache/tika/parser/mock/MockParser.java | 12 +- .../apache/tika/sax/LinkContentHandlerTest.java | 36 +- .../tika/language/langbuilder/welsh_corpus.txt | 5204 +++++++++--------- tika-example/pom.xml | 16 +- .../java/org/apache/tika/example/Language.java | 32 +- .../tika/example/LanguageDetectingParser.java | 16 +- .../tika/example/LanguageDetectorExample.java | 33 + .../tika/example/LanguageIdentifierExample.java | 27 - .../org/apache/tika/example/MyFirstTika.java | 13 +- .../org/apache/tika/example/ParsingExample.java | 14 +- .../example/LanguageDetectorExampleTest.java | 39 + .../example/LanguageIdentifierExampleTest.java | 37 - tika-java7/pom.xml | 2 +- tika-langdetect/pom.xml | 171 + .../tika/langdetect/OptimaizeLangDetector.java | 196 + .../tika/langdetect/TextLangDetector.java | 146 + ...apache.tika.language.detect.LanguageDetector | 16 + .../tika/langdetect/LanguageDetectorTest.java | 92 + .../langdetect/OptimaizeLangDetectorTest.java | 265 + .../tika/langdetect/TextLangDetectorTest.java | 59 + .../src/test/resources/log4j.properties | 24 + .../apache/tika/langdetect/language-codes.txt | 186 + .../tika/langdetect/language-tests/da.test | 108 + .../tika/langdetect/language-tests/de.test | 104 + .../tika/langdetect/language-tests/el.test | 109 + .../tika/langdetect/language-tests/en.test | 105 + .../tika/langdetect/language-tests/es.test | 107 + .../tika/langdetect/language-tests/et.test | 17 + .../tika/langdetect/language-tests/fi.test | 106 + .../tika/langdetect/language-tests/fr.test | 105 + .../tika/langdetect/language-tests/it.test | 109 + .../tika/langdetect/language-tests/ja.test | 78 + .../tika/langdetect/language-tests/lt.test | 32 + .../tika/langdetect/language-tests/nl.test | 105 + .../tika/langdetect/language-tests/pt.test | 105 + .../tika/langdetect/language-tests/sv.test | 108 + .../tika/langdetect/language-tests/th.test | 28 + .../tika/langdetect/language-tests/zh.test | 57 + .../org/apache/tika/langdetect/text-test.tsv | 18 + .../org/apache/tika/langdetect/udhr-known.txt | 11 + .../org/apache/tika/langdetect/udhr-unknown.txt | 4 + tika-parent/pom.xml | 29 +- tika-parsers/pom.xml | 53 +- .../tika/parser/code/SourceCodeParser.java | 142 +- .../tika/parser/epub/EpubContentParser.java | 33 +- .../org/apache/tika/parser/epub/EpubParser.java | 8 +- .../parser/executable/ExecutableParser.java | 2 +- .../tika/parser/font/AdobeFontMetricParser.java | 16 +- .../apache/tika/parser/font/TrueTypeParser.java | 4 +- .../geoinfo/GeographicInformationParser.java | 30 +- .../apache/tika/parser/html/HtmlHandler.java | 3 + .../apache/tika/parser/image/ICNSParser.java | 117 + .../org/apache/tika/parser/image/ICNSType.java | 170 + .../parser/image/ImageMetadataExtractor.java | 45 +- .../tika/parser/image/xmp/JempboxExtractor.java | 75 +- .../tika/parser/isatab/ISArchiveParser.java | 62 +- .../tika/parser/jdbc/AbstractDBParser.java | 13 +- .../tika/parser/jdbc/JDBCTableReader.java | 68 +- .../tika/parser/jdbc/SQLite3DBParser.java | 31 +- .../apache/tika/parser/jdbc/SQLite3Parser.java | 6 +- .../tika/parser/jdbc/SQLite3TableReader.java | 45 +- .../apache/tika/parser/journal/TEIParser.java | 8 +- .../tika/parser/mail/MailContentHandler.java | 110 +- .../org/apache/tika/parser/mat/MatParser.java | 27 +- .../tika/parser/microsoft/HSLFExtractor.java | 14 + .../tika/parser/microsoft/OfficeParser.java | 3 +- .../microsoft/POIFSContainerDetector.java | 21 +- .../tika/parser/microsoft/WordExtractor.java | 11 +- .../microsoft/ooxml/AbstractOOXMLExtractor.java | 22 +- .../ooxml/XSLFPowerPointExtractorDecorator.java | 58 +- .../ooxml/XSSFExcelExtractorDecorator.java | 112 +- .../microsoft/xml/AbstractXML2003Parser.java | 86 + .../parser/microsoft/xml/HyperlinkHandler.java | 96 + .../microsoft/xml/SpreadsheetMLParser.java | 161 + .../tika/parser/microsoft/xml/WordMLParser.java | 229 + .../parser/mp4/DirectFileReadDataSource.java | 34 +- .../org/apache/tika/parser/mp4/MP4Parser.java | 379 +- .../parser/ner/grobid/GrobidNERecogniser.java | 240 + .../parser/ner/mitie/MITIENERecogniser.java | 160 + .../tika/parser/ner/nltk/NLTKNERecogniser.java | 19 +- .../apache/tika/parser/netcdf/NetCDFParser.java | 20 +- .../parser/odf/OpenDocumentContentParser.java | 37 +- .../tika/parser/odf/OpenDocumentParser.java | 62 +- .../org/apache/tika/parser/pdf/PDF2XHTML.java | 244 +- .../parser/pdf/PDFEncodedStringDecoder.java | 14 +- .../org/apache/tika/parser/pdf/PDFParser.java | 143 +- .../apache/tika/parser/pdf/PDFParserConfig.java | 67 +- .../apache/tika/parser/pdf/XFAExtractor.java | 30 +- .../tika/parser/pot/PooledTimeSeriesParser.java | 394 +- .../services/org.apache.tika.parser.Parser | 5 +- .../parser/ner/grobid/GrobidServer.properties | 17 + .../apache/tika/parser/pdf/PDFParser.properties | 4 +- .../org/apache/tika/mime/TestMimeTypes.java | 21 +- .../parser/executable/ExecutableParserTest.java | 73 +- .../GeographicInformationParserTest.java | 48 +- .../apache/tika/parser/html/HtmlParserTest.java | 38 + .../tika/parser/image/ICNSParserTest.java | 65 + .../tika/parser/image/ImageParserTest.java | 5 +- .../tika/parser/jdbc/SQLite3ParserTest.java | 106 +- .../apache/tika/parser/jpeg/JpegParserTest.java | 21 +- .../tika/parser/mail/RFC822ParserTest.java | 115 + .../AbstractPOIContainerExtractionTest.java | 4 +- .../tika/parser/microsoft/ExcelParserTest.java | 18 +- .../microsoft/POIContainerExtractionTest.java | 35 +- .../parser/microsoft/PowerPointParserTest.java | 2 +- .../ooxml/OOXMLContainerExtractionTest.java | 23 +- .../parser/microsoft/ooxml/OOXMLParserTest.java | 24 +- .../parser/microsoft/xml/XML2003ParserTest.java | 81 + .../apache/tika/parser/mp4/MP4ParserTest.java | 12 +- .../apache/tika/parser/odf/ODFParserTest.java | 10 +- .../apache/tika/parser/pdf/PDFParserTest.java | 562 +- .../resources/test-documents/testEXCEL2003.xml | 100 + .../test-documents/testEXCEL_hyperlinks.xls | Bin 0 -> 29696 bytes .../test-documents/testEXCEL_hyperlinks.xlsx | Bin 0 -> 10038 bytes .../test/resources/test-documents/testHFA.hfa | Bin 0 -> 1024 bytes .../test/resources/test-documents/testICNS.icns | Bin 0 -> 2472 bytes .../test-documents/testICNS_basic.icns | Bin 0 -> 18199 bytes .../resources/test-documents/testKeynoteNew.key | Bin 0 -> 274397 bytes .../test/resources/test-documents/testMIF.mif | Bin 0 -> 10240 bytes .../test-documents/testMP4_truncated.m4a | Bin 0 -> 74 bytes .../testMSChart-govdocs-428996.ppt | Bin 0 -> 41472 bytes .../testMSChart-govdocs-428996.pptx | Bin 0 -> 56224 bytes .../testMSChart-govdocs-428996.xls | Bin 0 -> 35328 bytes .../testMSChart-govdocs-428996.xlsx | Bin 0 -> 17112 bytes .../test-documents/testNumbersNew.numbers | Bin 0 -> 179147 bytes .../resources/test-documents/testODTNoMeta.odt | Bin 0 -> 5847 bytes .../test-documents/testPDF_bad_page_303226.pdf | Bin 0 -> 138027 bytes .../resources/test-documents/testPagesNew.pages | Bin 0 -> 237567 bytes .../test-documents/testRFC822_date_utf8 | 8 + .../resources/test-documents/testRFC822_eml | 33 + .../resources/test-documents/testSqlite3b.db | Bin 27648 -> 27648 bytes .../resources/test-documents/testWORD2003.xml | 2542 +++++++++ tika-serialization/pom.xml | 4 +- tika-server/pom.xml | 45 +- .../tika/server/resource/LanguageResource.java | 27 +- .../tika/server/resource/MetadataResource.java | 9 +- .../resource/RecursiveMetadataResource.java | 7 +- .../tika/server/resource/TranslateResource.java | 22 +- .../org/apache/tika/server/CXFTestBase.java | 26 +- tika-translate/pom.xml | 9 +- .../language/translate/AbstractTranslator.java | 32 + .../language/translate/CachedTranslator.java | 20 +- .../language/translate/ExternalTranslator.java | 13 +- .../language/translate/GoogleTranslator.java | 20 +- .../language/translate/Lingo24Translator.java | 20 +- .../language/translate/MosesTranslator.java | 7 +- .../language/translate/YandexTranslator.java | 175 + .../translate/translator.yandex.properties | 24 + .../translate/YandexTranslatorTest.java | 105 + tika-xmp/pom.xml | 2 +- 221 files changed, 13467 insertions(+), 5115 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/e780d566/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java ---------------------------------------------------------------------- diff --cc tika-core/src/main/java/org/apache/tika/config/TikaConfig.java index caa916a,0e3acd9..896b51b --- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java @@@ -35,12 -35,6 +37,8 @@@ import java.util.Map import java.util.Set; import java.util.concurrent.ExecutorService; - import javax.imageio.spi.ServiceRegistry; - import javax.xml.parsers.DocumentBuilder; - import javax.xml.parsers.DocumentBuilderFactory; - import javax.xml.parsers.ParserConfigurationException; - +import org.apache.tika.base.Configurable; ++ import org.apache.tika.concurrent.ConfigurableThreadPoolExecutor; import org.apache.tika.concurrent.SimpleThreadPoolExecutor; import org.apache.tika.detect.CompositeDetector; http://git-wip-us.apache.org/repos/asf/tika/blob/e780d566/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java ---------------------------------------------------------------------- diff --cc tika-core/src/main/java/org/apache/tika/parser/ParseContext.java index 20607d9,2521cc9..e58f5c8 --- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java @@@ -43,11 -53,22 +53,27 @@@ public class ParseContext implements Se /** Map of objects in this context */ private final Map<String, Object> context = new HashMap<String, Object>(); + + /** + * Map of configurable arguments + */ + private final Map<String, String> params = new HashMap<>(); + + private static final EntityResolver IGNORING_SAX_ENTITY_RESOLVER = new EntityResolver() { + public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException { + return new InputSource(new StringReader("")); + } + }; + + private static final XMLResolver IGNORING_STAX_ENTITY_RESOLVER = + new XMLResolver() { + @Override + public Object resolveEntity(String publicID, String systemID, String baseURI, String namespace) throws + XMLStreamException { + return ""; + } + }; + /** * Adds the given value to the context as an implementation of the given * interface. @@@ -150,36 -196,111 +201,144 @@@ } /** + * Stores a key=value parameter + * @param key parameter name + * @param value value + */ + public void setParam(String key, String value){ + this.params.put(key, value); + } + + /** + * Gets the value associated with given parameter + * @param key parameter name + */ + public void getParam(String key){ + this.params.get(key); + } + + /** + * Gets all the params + * @return map of key values + */ + public Map<String, String> getParams() { + return params; + } + + /** + * Checks if parameter is available + * @param key parameter name + * @return true if parameter is available, false otherwise + */ + public boolean hasParam(String key){ + return params.containsKey(key); + } ++ /** + * Returns the DOM builder factory specified in this parsing context. + * If a factory is not explicitly specified, then a default factory + * instance is created and returned. The default factory instance is + * configured to be namespace-aware and to apply reasonable security + * features. + * + * @since Apache Tika 1.13 + * @return DOM parser factory + */ + private DocumentBuilderFactory getDocumentBuilderFactory() { + //borrowed from Apache POI + DocumentBuilderFactory documentBuilderFactory = get(DocumentBuilderFactory.class); + if (documentBuilderFactory != null) { + return documentBuilderFactory; + } + documentBuilderFactory = DocumentBuilderFactory.newInstance(); + documentBuilderFactory.setNamespaceAware(true); + documentBuilderFactory.setValidating(false); + tryToSetSAXFeatureOnDOMFactory(documentBuilderFactory, + XMLConstants.FEATURE_SECURE_PROCESSING, true); + tryToSetXercesManager(documentBuilderFactory); + return documentBuilderFactory; + } + + /** + * Returns the DOM builder specified in this parsing context. + * If a builder is not explicitly specified, then a builder + * instance is created and returned. The builder instance is + * configured to apply an {@link #IGNORING_SAX_ENTITY_RESOLVER}, + * and it sets the ErrorHandler to <code>null</code>. + * + * @since Apache Tika 1.13 + * @return DOM Builder + */ + public DocumentBuilder getDocumentBuilder() throws TikaException { + DocumentBuilder documentBuilder = get(DocumentBuilder.class); + if (documentBuilder != null) { + return documentBuilder; + } + try { + DocumentBuilderFactory documentBuilderFactory = getDocumentBuilderFactory(); + documentBuilder = documentBuilderFactory.newDocumentBuilder(); + documentBuilder.setEntityResolver(IGNORING_SAX_ENTITY_RESOLVER); + documentBuilder.setErrorHandler(null); + return documentBuilder; + } catch (ParserConfigurationException e) { + throw new TikaException("XML parser not available", e); + } + } + + /** + * Returns the StAX input factory specified in this parsing context. + * If a factory is not explicitly specified, then a default factory + * instance is created and returned. The default factory instance is + * configured to be namespace-aware and to apply reasonable security + * using the {@link #IGNORING_STAX_ENTITY_RESOLVER}. + * + * @since Apache Tika 1.13 + * @return StAX input factory + */ + public XMLInputFactory getXMLInputFactory() { + XMLInputFactory factory = get(XMLInputFactory.class); + if (factory != null) { + return factory; + } + factory = XMLInputFactory.newFactory(); + + tryToSetStaxProperty(factory, XMLInputFactory.IS_NAMESPACE_AWARE, true); + tryToSetStaxProperty(factory, XMLInputFactory.IS_VALIDATING, false); + + factory.setXMLResolver(IGNORING_STAX_ENTITY_RESOLVER); + return factory; + } + + private static void tryToSetSAXFeatureOnDOMFactory(DocumentBuilderFactory dbf, String feature, boolean value) { + try { + dbf.setFeature(feature, value); + } catch (Exception|AbstractMethodError e) { + } + } + + private static void tryToSetXercesManager(DocumentBuilderFactory dbf) { + // Try built-in JVM one first, standalone if not + for (String securityManagerClassName : new String[] { + "com.sun.org.apache.xerces.internal.util.SecurityManager", + "org.apache.xerces.util.SecurityManager" + }) { + try { + Object mgr = Class.forName(securityManagerClassName).newInstance(); + Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE); + setLimit.invoke(mgr, 4096); + dbf.setAttribute("http://apache.org/xml/properties/security-manager", mgr); + // Stop once one can be setup without error + return; + } catch (Throwable t) { + } + } + } + + private void tryToSetStaxProperty(XMLInputFactory factory, String key, boolean value) { + try { + factory.setProperty(key, value); + } catch (IllegalArgumentException e) { + //swallow + } + } + }
