This is an automated email from the ASF dual-hosted git repository.
tallison pushed a change to branch chardet-work
in repository https://gitbox.apache.org/repos/asf/tika.git
from dbb4d19222 fix ebcdic test
add e5151b1e5b TIKA-4327: update google-api
add 2fd8c0eda2 TIKA-4327: update microsoft-graph.version, maven.bundle,
aws, google cloud, junrar, mockito, error_prone_annotations
add 9451da2d56 TIKA-4606: Upgrade Apache Ignite from 2.x to 3.x (fresh)
(#2654)
add 2a9957a12b Bump org.tukaani:xz from 1.11 to 1.12 (#2670)
add 0385b58466 Bump io.swagger.core.v3:swagger-annotations from 2.2.38 to
2.2.43 (#2669)
add 1d46c8b97f Bump org.jetbrains.kotlin:kotlin-stdlib from 2.2.0 to
2.3.10 (#2663)
add aaef3ca7a3 Bump info.picocli:picocli from 4.7.5 to 4.7.7 (#2661)
add b9903d0840 Bump org.jetbrains:annotations from 26.0.2-1 to 26.1.0
(#2659)
add 4ba11a4e19 Bump org.yaml:snakeyaml from 2.4 to 2.6 (#2671)
add 4c9017fca3 Bump jakarta.inject:jakarta.inject-api from 2.0.1 to
2.0.1.MR (#2667)
add 93b5cfa96f TIKA-4488: update micronaut
add d139bfe02e TIKA-4488: add micronaut version
add a7116b05d9 TIKA-4488: add micronaut version
add 27933e64b9 TIKA-4327: add comment
add 9f94799669 TIKA-4327: update tyrus, kiota, solrj, spotless-maven-plugin
add 30e46db4fa TIKA-4606: Add e2e tests for Ignite 3.x upgrade (#2655)
add ca67465e90 TIKA-4327: update aws, swagger, jackrabbit; add comment on
solrj 10 migration
add fdac94fc18 TIKA-4682 4x tweaks (#2674)
add bbcb82d5a6 Merge remote-tracking branch 'origin/main' into chardet-work
add 5b32cdf660 chardet - fix IBM855/IBM866 model placement, add docs and
EBCDIC routing test
add ade0611dd5 chardet - wip
add bb9f585def chardet - wip
add 599a0427a4 TIKA-4327: update aws, zookeeper, shade plugin, azure
add 2600f092b6 Merge remote-tracking branch 'origin/main' into chardet-work
No new revisions were added by this update.
Summary of changes:
.github/workflows/main-jdk17-build.yml | 19 +
.../main-jdk17-windows-build-multi-locale.yml | 2 +-
.github/workflows/main-jdk17-windows-build.yml | 2 +-
.java-version | 18 -
docs/modules/ROOT/nav.adoc | 1 +
.../pages/advanced/charset-detection-design.adoc | 543 ++++++-----
pom.xml | 6 +
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 6 +-
tika-e2e-tests/README.md | 12 +-
tika-e2e-tests/pom.xml | 66 +-
tika-e2e-tests/tika-grpc/README.md | 100 +-
tika-e2e-tests/tika-grpc/pom.xml | 52 +-
.../tika/parser/ocr/TesseractOCRConfig.properties | 25 -
.../customocr/tika-config-inline.json | 25 -
.../customocr/tika-config-inline.xml | 49 -
.../customocr/tika-config-rendered.json | 27 -
.../customocr/tika-config-rendered.xml | 55 --
.../tika/parser/journal/GrobidExtractor.properties | 16 -
.../sample-configs/grobid/tika-config.json | 22 -
.../sample-configs/grobid/tika-config.xml | 41 -
.../tika-grpc/sample-configs/ignite/README.md | 117 ---
.../sample-configs/ignite/tika-config-ignite.json | 2 +-
.../sample-configs/ner/run_tika_server.sh | 62 --
.../tika-grpc/sample-configs/ner/tika-config.json | 26 -
.../tika-grpc/sample-configs/ner/tika-config.xml | 45 -
.../tika-grpc/sample-configs/test-simple.json | 20 -
.../vision/inception-rest-caption.json | 18 -
.../vision/inception-rest-caption.xml | 32 -
.../vision/inception-rest-video.json | 18 -
.../sample-configs/vision/inception-rest-video.xml | 32 -
.../sample-configs/vision/inception-rest.json | 18 -
.../sample-configs/vision/inception-rest.xml | 32 -
.../org/apache/tika/pipes/ExternalTestBase.java | 285 +++++-
.../pipes/filesystem/FileSystemFetcherTest.java | 79 +-
.../tika/pipes/ignite/IgniteConfigStoreTest.java | 679 ++++++++-----
.../java/org/apache/tika/pipes/ignite/README.md | 172 ----
.../src/test/resources/docker-compose-ignite.yml | 25 -
.../src/test/resources/docker-compose.yml | 16 -
.../tika-grpc/src/test/resources/log4j2.xml | 19 -
.../src/test/resources/test-fixtures/sample.csv | 4 +
.../src/test/resources/test-fixtures/sample.html | 8 +
.../src/test/resources/test-fixtures/sample.txt | 3 +
.../src/test/resources/test-fixtures/sample.xml | 5 +
...g-ignite.json => tika-config-ignite-local.json} | 4 +-
.../src/test/resources/tika-config-ignite.json | 2 +-
.../tika-grpc/src/test/resources/tika-config.json | 49 +-
.../charsoup/CharSoupEncodingDetector.java | 67 +-
.../tika/ml/chardetect/CharsetConfusables.java | 61 +-
.../ml/chardetect/MojibusterEncodingDetector.java | 287 ++----
.../tika/ml/chardetect/chardetect-ebcdic.bin | Bin 7312 -> 0 bytes
.../org/apache/tika/ml/chardetect/chardetect.bin | Bin 410106 -> 606934 bytes
.../tika/ml/chardetect/EbcdicRoutingTest.java | 78 +-
tika-eval/tika-eval-app/pom.xml | 25 +-
.../tika-eval-app}/src/main/assembly/assembly.xml | 4 -
tika-grpc/dev-tika-config.json | 3 +-
tika-grpc/pom.xml | 22 +-
tika-grpc/run-dev.sh | 15 +-
.../org/apache/tika/pipes/grpc/TikaGrpcServer.java | 7 +-
.../apache/tika/pipes/grpc/TikaGrpcServerImpl.java | 39 +-
tika-grpc/src/main/proto/tika.proto | 2 +
.../src/test/resources/tika-config-ignite.json | 2 +-
.../charsoup/CharSoupLanguageDetector.java | 6 +
tika-ml/tika-ml-chardetect/README.md | 266 ++++-
.../chardetect/tools/BuildCharsetTrainingData.java | 1016 ++++++++++++++++++++
.../ml/chardetect/tools/EvalCharsetDetectors.java | 4 +-
.../ml/chardetect/tools/TrainCharsetModel.java | 5 +-
.../src/test/python/build_charset_training.py | 855 ----------------
tika-parent/pom.xml | 123 ++-
.../org/apache/tika/parser/txt/TXTParserTest.java | 7 +-
.../tika/async/cli/FileListPipesIterator.java | 122 +++
.../org/apache/tika/async/cli/PluginsWriter.java | 118 ++-
.../org/apache/tika/async/cli/TikaAsyncCLI.java | 102 +-
.../apache/tika/async/cli/AsyncCliParserTest.java | 44 +-
.../tika/async/cli/FileListPipesIteratorTest.java | 103 ++
tika-pipes/tika-pipes-config-store-ignite/pom.xml | 113 ++-
.../tika/pipes/ignite/ExtensionConfigDTO.java | 29 +-
.../tika/pipes/ignite/IgniteConfigStore.java | 182 ++--
.../ignite/config/IgniteConfigStoreConfig.java | 59 +-
.../pipes/ignite/server/IgniteStoreServer.java | 201 ++--
.../tika/pipes/ignite/IgniteConfigStoreTest.java | 119 +--
.../tika/pipes/core/async/AsyncProcessor.java | 33 +-
.../tika-pipes-google-drive/pom.xml | 4 +-
.../tika-pipes-microsoft-graph/pom.xml | 4 +-
.../apache/tika/server/core/TikaServerProcess.java | 29 +-
tika-translate/pom.xml | 2 +-
85 files changed, 3789 insertions(+), 3228 deletions(-)
delete mode 100644 .java-version
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/customocr/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/customocr/tika-config-inline.json
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/customocr/tika-config-inline.xml
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/customocr/tika-config-rendered.json
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/customocr/tika-config-rendered.xml
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/grobid/org/apache/tika/parser/journal/GrobidExtractor.properties
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/grobid/tika-config.json
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/grobid/tika-config.xml
delete mode 100644 tika-e2e-tests/tika-grpc/sample-configs/ignite/README.md
delete mode 100755
tika-e2e-tests/tika-grpc/sample-configs/ner/run_tika_server.sh
delete mode 100644 tika-e2e-tests/tika-grpc/sample-configs/ner/tika-config.json
delete mode 100644 tika-e2e-tests/tika-grpc/sample-configs/ner/tika-config.xml
delete mode 100644 tika-e2e-tests/tika-grpc/sample-configs/test-simple.json
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/vision/inception-rest-caption.json
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/vision/inception-rest-caption.xml
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/vision/inception-rest-video.json
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/vision/inception-rest-video.xml
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/vision/inception-rest.json
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/vision/inception-rest.xml
delete mode 100644
tika-e2e-tests/tika-grpc/src/test/java/org/apache/tika/pipes/ignite/README.md
delete mode 100644
tika-e2e-tests/tika-grpc/src/test/resources/docker-compose-ignite.yml
delete mode 100644
tika-e2e-tests/tika-grpc/src/test/resources/docker-compose.yml
delete mode 100644 tika-e2e-tests/tika-grpc/src/test/resources/log4j2.xml
create mode 100644
tika-e2e-tests/tika-grpc/src/test/resources/test-fixtures/sample.csv
create mode 100644
tika-e2e-tests/tika-grpc/src/test/resources/test-fixtures/sample.html
create mode 100644
tika-e2e-tests/tika-grpc/src/test/resources/test-fixtures/sample.txt
create mode 100644
tika-e2e-tests/tika-grpc/src/test/resources/test-fixtures/sample.xml
copy tika-e2e-tests/tika-grpc/src/test/resources/{tika-config-ignite.json => tika-config-ignite-local.json} (90%)
delete mode 100644
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect-ebcdic.bin
copy {tika-pipes/tika-pipes-fork-parser => tika-eval/tika-eval-app}/src/main/assembly/assembly.xml (92%)
create mode 100644
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BuildCharsetTrainingData.java
delete mode 100644
tika-ml/tika-ml-chardetect/src/test/python/build_charset_training.py
create mode 100644
tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/FileListPipesIterator.java
create mode 100644
tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/FileListPipesIteratorTest.java