This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-3347 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 90ab0ac87535bd0f4ca06007b214117b35147b45 Merge: f336c59 3ef48c2 Author: tallison <[email protected]> AuthorDate: Thu Apr 15 08:14:15 2021 -0400 Merge remote-tracking branch 'origin/main' into TIKA-3347 # Conflicts: # tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java .gitattributes | 1 + CHANGES.txt | 10 +- NOTICE.txt | 24 + .../src/main/java/org/apache/tika/cli/TikaCLI.java | 5 +- tika-core/pom.xml | 451 +- .../org/apache/tika/parser/mock/MockParser.java | 75 + .../mock/MockParserTest.java} | 29 +- .../tika/parser/multiple/MultipleParserTest.java | 15 +- .../resources/test-documents/mock_fakeload.xml | 29 + tika-eval/tika-eval-app/pom.xml | 9 +- .../apache/tika/eval/app/tools/LeipzigHelper.java | 4 +- .../src/main/resources/comparison-reports-pg.xml | 2 + .../src/main/resources/comparison-reports.xml | 29 + .../apache/tika/eval/app/SimpleComparerTest.java | 4 +- tika-eval/tika-eval-core/pom.xml | 5 +- .../eval/core/tokens/CommonTokenCountManager.java | 3 + .../src/main/resources/common_tokens/amh | 24373 +++++--- .../src/main/resources/common_tokens/asm | 13760 +++-- .../src/main/resources/common_tokens/azj | 30022 ---------- .../src/main/resources/common_tokens/ben-rom | 30022 ++++++++++ .../src/main/resources/common_tokens/bih | 903 + .../src/main/resources/common_tokens/ckb | 5668 +- .../src/main/resources/common_tokens/div | 59956 +++++++++---------- .../src/main/resources/common_tokens/ful | 9679 +++ .../src/main/resources/common_tokens/gla | 26778 +++++++++ .../src/main/resources/common_tokens/gom | 10605 ++++ .../src/main/resources/common_tokens/gug | 5528 ++ .../src/main/resources/common_tokens/hat | 6924 ++- .../src/main/resources/common_tokens/hau | 30022 ++++++++++ .../src/main/resources/common_tokens/hin-rom | 30022 ++++++++++ .../src/main/resources/common_tokens/ibo | 9827 +++ .../src/main/resources/common_tokens/khm | 30022 ++++++++++ .../src/main/resources/common_tokens/kin | 9513 ++- .../src/main/resources/common_tokens/knn | 5022 ++ .../src/main/resources/common_tokens/kur | 31363 +++++++++- .../src/main/resources/common_tokens/lao | 30022 ++++++++++ .../src/main/resources/common_tokens/lin | 5655 ++ .../src/main/resources/common_tokens/lug | 54373 +++++++++-------- .../src/main/resources/common_tokens/mhr | 8081 +-- .../src/main/resources/common_tokens/mya | 30022 ++++++++++ .../src/main/resources/common_tokens/mya-zaw | 30022 ++++++++++ .../src/main/resources/common_tokens/new | 1024 + .../src/main/resources/common_tokens/nso | 4993 ++ .../src/main/resources/common_tokens/ori | 9393 +-- .../src/main/resources/common_tokens/orm | 30022 ++++++++++ .../src/main/resources/common_tokens/plt | 2532 - .../src/main/resources/common_tokens/quz | 4441 ++ .../src/main/resources/common_tokens/roh | 13773 +++++ .../src/main/resources/common_tokens/snd | 9547 +-- .../src/main/resources/common_tokens/srd | 525 + .../src/main/resources/common_tokens/ssw | 277 + .../src/main/resources/common_tokens/sun | 20671 ------- .../src/main/resources/common_tokens/tam-rom | 30022 ++++++++++ .../src/main/resources/common_tokens/tel-rom | 30022 ++++++++++ .../src/main/resources/common_tokens/tsn | 13918 +++++ .../src/main/resources/common_tokens/tuk | 30609 +++++----- .../src/main/resources/common_tokens/uig | 30576 ++++++---- .../src/main/resources/common_tokens/urd-rom | 30022 ++++++++++ .../src/main/resources/common_tokens/wol | 4502 ++ .../src/main/resources/common_tokens/xho | 19058 +++--- .../src/main/resources/common_tokens/yid | 12049 ++-- .../src/main/resources/common_tokens/yor | 1957 + .../src/main/resources/common_tokens/zho-simp | 30022 ++++++++++ .../src/main/resources/common_tokens/zho-trad | 30022 ++++++++++ .../apache/tika/eval/core/langid/LangIdTest.java | 27 +- tika-langdetect/overview.html | 24 - tika-langdetect/pom.xml | 1 + tika-langdetect/tika-langdetect-commons/pom.xml | 1 + tika-langdetect/tika-langdetect-opennlp/.gitignore | 2 +- .../tika/langdetect/opennlp/OpenNLPDetector.java | 43 +- .../main/resources/opennlp-langdetect-20210413.bin | Bin 0 -> 23536171 bytes .../opennlp_langdetect_model_20190626.bin | Bin 11579707 -> 0 bytes .../langdetect/opennlp/OpenNLPDetectorTest.java | 11 +- .../pom.xml | 57 +- .../tika/langdetect/tika}/LanguageIdentifier.java | 12 +- .../tika/langdetect/tika}/LanguageProfile.java | 4 +- .../langdetect/tika}/LanguageProfilerBuilder.java | 4 +- .../tika/langdetect/tika}/ProfilingWriter.java | 4 +- .../tika/langdetect/tika/TikaLanguageDetector.java | 91 + ...rg.apache.tika.language.detect.LanguageDetector | 16 + .../org/apache/tika/langdetect/tika}/be.ngp | 0 .../org/apache/tika/langdetect/tika}/ca.ngp | 0 .../org/apache/tika/langdetect/tika}/da.ngp | 0 .../org/apache/tika/langdetect/tika}/de.ngp | 0 .../org/apache/tika/langdetect/tika}/el.ngp | 0 .../org/apache/tika/langdetect/tika}/en.ngp | 0 .../org/apache/tika/langdetect/tika}/eo.ngp | 0 .../org/apache/tika/langdetect/tika}/es.ngp | 0 .../org/apache/tika/langdetect/tika}/et.ngp | 0 .../org/apache/tika/langdetect/tika}/fa.ngp | 0 .../org/apache/tika/langdetect/tika}/fi.ngp | 0 .../org/apache/tika/langdetect/tika}/fr.ngp | 0 .../org/apache/tika/langdetect/tika}/gl.ngp | 0 .../org/apache/tika/langdetect/tika}/hu.ngp | 0 .../org/apache/tika/langdetect/tika}/is.ngp | 0 .../org/apache/tika/langdetect/tika}/it.ngp | 0 .../org/apache/tika/langdetect/tika}/lt.ngp | 0 .../org/apache/tika/langdetect/tika}/nl.ngp | 0 .../org/apache/tika/langdetect/tika}/no.ngp | 0 .../org/apache/tika/langdetect/tika}/pl.ngp | 0 .../org/apache/tika/langdetect/tika}/pt.ngp | 0 .../org/apache/tika/langdetect/tika}/ro.ngp | 0 .../org/apache/tika/langdetect/tika}/ru.ngp | 0 .../org/apache/tika/langdetect/tika}/sk.ngp | 0 .../org/apache/tika/langdetect/tika}/sl.ngp | 0 .../org/apache/tika/langdetect/tika}/sv.ngp | 0 .../org/apache/tika/langdetect/tika}/th.ngp | 0 .../tika/langdetect/tika}/tika.language.properties | 0 .../org/apache/tika/langdetect/tika}/uk.ngp | 0 .../langdetect/tika}/LanguageIdentifierTest.java | 3 +- .../tika/langdetect/tika}/LanguageProfileTest.java | 3 +- .../tika}/LanguageProfilerBuilderTest.java | 65 +- .../tika/langdetect/tika}/ProfilingHandler.java | 4 +- .../tika/langdetect/tika}/ProfilingWriterTest.java | 2 +- .../org/apache/tika/langdetect/tika}/da.test | 0 .../org/apache/tika/langdetect/tika}/de.test | 0 .../org/apache/tika/langdetect/tika}/el.test | 0 .../org/apache/tika/langdetect/tika}/en.test | 0 .../org/apache/tika/langdetect/tika}/es.test | 0 .../org/apache/tika/langdetect/tika}/et.test | 0 .../org/apache/tika/langdetect/tika}/fi.test | 0 .../org/apache/tika/langdetect/tika}/fr.test | 0 .../org/apache/tika/langdetect/tika}/it.test | 0 .../langdetect/tika}/langbuilder/welsh_corpus.txt | 0 .../org/apache/tika/langdetect/tika}/lt.test | 0 .../org/apache/tika/langdetect/tika}/nl.test | 0 .../org/apache/tika/langdetect/tika}/pt.test | 0 .../org/apache/tika/langdetect/tika}/sv.test | 0 tika-parent/checkstyle.xml | 2 +- tika-parent/pom.xml | 1 + .../fetchiterator/jdbc/TestJDBCFetchIterator.java | 11 +- .../tika/server/classic/TikaResourceTest.java | 119 +- .../apache/tika/server/core/TikaServerProcess.java | 2 +- .../core/resource/RecursiveMetadataResource.java | 4 +- .../tika/server/core/resource/TikaResource.java | 150 +- .../server/core/writer/JSONMessageBodyWriter.java | 10 +- .../tika/server/core/writer/JSONObjWriter.java | 8 +- .../core/writer/MetadataListMessageBodyWriter.java | 6 +- .../org/apache/tika/server/core/CXFTestBase.java | 15 +- .../server/core/RecursiveMetadataResourceTest.java | 77 + .../apache/tika/server/core/StackTraceOffTest.java | 12 +- .../apache/tika/server/core/StackTraceTest.java | 12 +- .../core/TikaResourceMetadataFilterTest.java | 82 + ...ourceTest.java => TikaResourceNoStackTest.java} | 82 +- .../apache/tika/server/core/TikaResourceTest.java | 87 + .../resources/configs/metadata-filter-include.xml | 30 + .../test-documents/mock/hello_world_long.xml | 30 + 147 files changed, 701855 insertions(+), 188115 deletions(-)
