This is an automated email from the ASF dual-hosted git repository. ndipiazza pushed a commit to branch TIKA-4578 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 08df7e7423b92bffecaad177995563aed2ca8e0f
Author: Nicholas DiPiazza <[email protected]>
AuthorDate: Tue Dec 16 13:50:37 2025 -0600

    TIKA-4578: Add Docker build configuration for tika-grpc

    - Add Dockerfile with Ubuntu base, Java 17, OCR, and font support
    - Add docker-build.sh script to build and optionally push images
    - Add start-tika-grpc.sh entrypoint script
    - Include all tika-pipes plugins (fetchers, emitters, iterators)
    - Include parser packages (standard, extended, ML, scientific, sqlite3, NLP)
    - Add README with usage instructions and examples
    - Support multi-arch builds and multiple registries (Docker Hub, ECR, ACR)
---
 tika-grpc/docker-build/Dockerfile         |  66 +++++++++++++++
 tika-grpc/docker-build/README.md          | 128 ++++++++++++++++++++++++++++++
 tika-grpc/docker-build/docker-build.sh    | 113 ++++++++++++++++++++++++++
 tika-grpc/docker-build/start-tika-grpc.sh |  29 +++++++
 4 files changed, 336 insertions(+)

diff --git a/tika-grpc/docker-build/Dockerfile b/tika-grpc/docker-build/Dockerfile
new file mode 100644
index 000000000..9164ee7b2
--- /dev/null
+++ b/tika-grpc/docker-build/Dockerfile
@@ -0,0 +1,66 @@
+# Runtime image for the Apache Tika gRPC server.
+# docker-build.sh stages the build context (libs/, plugins/, config/, bin/)
+# and runs the Docker build against this file.
+
+# Pinned release instead of the moving "latest" tag so rebuilds are reproducible.
+# NOTE(review): 24.04 is assumed to be the release "latest" pointed at when this
+# file was written - confirm, and bump deliberately.
+FROM ubuntu:24.04
+
+# Java runtime package to install; override with --build-arg JRE=<package>.
+ARG JRE='openjdk-17-jre-headless'
+
+# OS packages: JRE, GDAL, Tesseract OCR with language packs, and fonts.
+# Installed before any COPY so that rebuilding with new jars or plugins reuses
+# this cached layer. DEBIAN_FRONTEND is exported inside the RUN only, so every
+# apt-get call is non-interactive without the variable reaching the runtime env.
+# The debconf line pre-accepts the Microsoft core fonts EULA prompt.
+RUN set -eux \
+    && export DEBIAN_FRONTEND=noninteractive \
+    && apt-get update \
+    && apt-get install --yes --no-install-recommends gnupg2 software-properties-common \
+    && apt-get install --yes --no-install-recommends $JRE \
+        gdal-bin \
+        tesseract-ocr \
+        tesseract-ocr-eng \
+        tesseract-ocr-ita \
+        tesseract-ocr-fra \
+        tesseract-ocr-spa \
+        tesseract-ocr-deu \
+    && echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections \
+    && apt-get install --yes --no-install-recommends \
+        xfonts-utils \
+        fonts-freefont-ttf \
+        fonts-liberation \
+        ttf-mscorefonts-installer \
+        wget \
+        cabextract \
+    && apt-get clean -y \
+    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+# Build-time defaults for the runtime settings below (override with --build-arg).
+# Declared after the package layer so that changing them does not invalidate it.
+ARG VERSION='4.0.0-SNAPSHOT'
+ARG TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE=104857600
+ARG TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE=104857600
+ARG TIKA_GRPC_NUM_THREADS=4
+
+# Read by start-tika-grpc.sh at container start; override with "docker run -e".
+# TIKA_VERSION must match the version in the jar name copied into /tika/libs/.
+ENV TIKA_VERSION=$VERSION \
+    TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE=$TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE \
+    TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE=$TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE \
+    TIKA_GRPC_NUM_THREADS=$TIKA_GRPC_NUM_THREADS
+
+# Port that start-tika-grpc.sh configures for the gRPC server.
+EXPOSE 9090
+
+# Application payload goes last: it is the part that changes between builds.
+COPY libs/ /tika/libs/
+COPY plugins/ /tika/plugins/
+COPY config/ /tika/config/
+COPY bin/ /tika/bin
+RUN chmod +x "/tika/bin/start-tika-grpc.sh"
+
+# NOTE(review): no USER is set, so the server runs as root - consider a dedicated user.
+ENTRYPOINT ["/tika/bin/start-tika-grpc.sh"]
diff --git a/tika-grpc/docker-build/README.md b/tika-grpc/docker-build/README.md
new file mode 100644
index 000000000..7bf2ad5ee
--- /dev/null
+++ b/tika-grpc/docker-build/README.md
@@ -0,0 +1,128 @@
+# Tika gRPC Docker Build
+
+This directory contains the Docker build configuration for Apache Tika gRPC server.
+
+## Overview
+
+The Docker image includes:
+- Tika gRPC server JAR
+- All Tika Pipes plugins (fetchers, emitters, iterators)
+- Parser packages (standard, extended, ML)
+- OCR support (Tesseract with multiple languages)
+- GDAL for geospatial formats
+- Common fonts
+
+## Building the Docker Image
+
+### Prerequisites
+
+1. Build Tika from the project root:
+```bash
+mvn clean install -DskipTests
+```
+
+2. 
Set the required environment variable:
+```bash
+export TIKA_VERSION=4.0.0-SNAPSHOT
+```
+
+### Run the Docker Build Script
+
+From the project root directory (set at least one of `DOCKER_ID`, `AWS_ACCOUNT_ID`, or `AZURE_REGISTRY_NAME` first; otherwise the Docker build step is skipped):
+
+```bash
+./tika-grpc/docker-build/docker-build.sh
+```
+
+### Build Script Environment Variables
+
+- `TIKA_VERSION`: Maven project version (required)
+- `RELEASE_IMAGE_TAG`: Override the default tag (defaults to TIKA_VERSION without -SNAPSHOT)
+- `DOCKER_ID`: Docker Hub username; the image is tagged `<DOCKER_ID>/<PROJECT_NAME>:<tag>`
+- `AWS_ACCOUNT_ID`: AWS account ID; the script logs in to ECR and tags the image for it
+- `AWS_REGION`: AWS region for ECR (default: us-west-2)
+- `AZURE_REGISTRY_NAME`: Azure Container Registry name; the script logs in to ACR and tags the image for it
+- `MULTI_ARCH`: Build for linux/amd64 and linux/arm64 and push the result (default: false)
+- `PROJECT_NAME`: Docker image name (default: tika-grpc)
+
+### Examples
+
+Build and tag for Docker Hub, then push:
+```bash
+export TIKA_VERSION=4.0.0-SNAPSHOT
+export DOCKER_ID=myusername
+./tika-grpc/docker-build/docker-build.sh
+docker push myusername/tika-grpc:4.0.0
+```
+
+Build and tag for AWS ECR (the script logs in to ECR; push the image afterwards with `docker push`):
+```bash
+export TIKA_VERSION=4.0.0-SNAPSHOT
+export AWS_ACCOUNT_ID=123456789012
+export AWS_REGION=us-east-1
+./tika-grpc/docker-build/docker-build.sh
+```
+
+Build and push a multi-architecture image:
+```bash
+export TIKA_VERSION=4.0.0-SNAPSHOT
+export DOCKER_ID=myusername
+export MULTI_ARCH=true
+./tika-grpc/docker-build/docker-build.sh
+```
+
+## Running the Docker Container
+
+```bash
+docker run -p 9090:9090 myusername/tika-grpc:4.0.0
+```
+
+### Environment Variables
+
+- `TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE`: Maximum inbound message size (default: 104857600)
+- `TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE`: Maximum outbound message size (default: 104857600)
+- `TIKA_GRPC_NUM_THREADS`: Number of gRPC server threads (default: 4)
+
+### Example with Custom Settings
+
+```bash
+docker run -p 9090:9090 \
+  -e TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE=209715200 \
+  -e TIKA_GRPC_NUM_THREADS=8 \
+  myusername/tika-grpc:4.0.0
+```
+
+## Included Plugins
+
+The Docker image includes all available Tika Pipes plugins:
+
+### Fetchers/Emitters
+- 
tika-pipes-file-system
+- tika-pipes-http
+- tika-pipes-s3
+- tika-pipes-az-blob
+- tika-pipes-gcs
+- tika-pipes-jdbc
+- tika-pipes-kafka
+- tika-pipes-microsoft-graph
+- tika-pipes-solr
+- tika-pipes-opensearch
+- tika-pipes-json
+- tika-pipes-csv
+
+### Parser Packages
+- tika-parsers-standard-package (included in base JAR)
+- tika-parser-scientific-package
+- tika-parser-sqlite3-package
+- tika-parser-nlp-package
+
+## Tesseract OCR Languages
+
+The following Tesseract language packs are pre-installed:
+- English (eng)
+- Italian (ita)
+- French (fra)
+- Spanish (spa)
+- German (deu)
+
+Additional languages can be added by modifying the Dockerfile.
diff --git a/tika-grpc/docker-build/docker-build.sh b/tika-grpc/docker-build/docker-build.sh
new file mode 100755
index 000000000..e95c470ae
--- /dev/null
+++ b/tika-grpc/docker-build/docker-build.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+# Stages the tika-grpc jar, the tika-pipes plugin zips and the parser package
+# jars under target/tika-docker, then runs the Docker build from that directory.
+# This script is intended to be run from Maven exec plugin during the package phase of maven build
+#
+# Environment variables:
+#   TIKA_VERSION         required; must match the Maven project version of Tika
+#   RELEASE_IMAGE_TAG    image tag (default: TIKA_VERSION without "-SNAPSHOT")
+#   DOCKER_ID            tag the image as <DOCKER_ID>/<PROJECT_NAME>
+#   AWS_ACCOUNT_ID       log in to ECR and tag the image for it
+#   AWS_REGION           ECR region (default: us-west-2)
+#   AZURE_REGISTRY_NAME  log in to ACR and tag the image for it
+#   MULTI_ARCH           "true": build linux/amd64 + linux/arm64 with buildx and push
+#   PROJECT_NAME         image name (default: tika-grpc)
+# If none of DOCKER_ID, AWS_ACCOUNT_ID, AZURE_REGISTRY_NAME is set, files are
+# staged but the Docker build is skipped.
+
+# Stop at the first failing command (missing jar, failed registry login, failed
+# build) instead of carrying on and reporting success.
+set -eo pipefail
+
+if [ -z "${TIKA_VERSION}" ]; then
+    echo "Environment variable TIKA_VERSION is required, and should match the maven project version of Tika"
+    exit 1
+fi
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+# Work from the project root (two levels above this script) so the relative paths below resolve.
+cd "${SCRIPT_DIR}/../../" || exit
+
+OUT_DIR=target/tika-docker
+
+MULTI_ARCH=${MULTI_ARCH:-false}
+AWS_REGION=${AWS_REGION:-us-west-2}
+AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID:-}
+AZURE_REGISTRY_NAME=${AZURE_REGISTRY_NAME:-}
+DOCKER_ID=${DOCKER_ID:-}
+PROJECT_NAME=${PROJECT_NAME:-tika-grpc}
+
+# If RELEASE_IMAGE_TAG not specified, use TIKA_VERSION
+if [[ -z "${RELEASE_IMAGE_TAG}" ]]; then
+    RELEASE_IMAGE_TAG="${TIKA_VERSION}"
+    ## Remove '-SNAPSHOT' from the version string
+    RELEASE_IMAGE_TAG="${RELEASE_IMAGE_TAG//-SNAPSHOT/}"
+fi
+
+mkdir -p "${OUT_DIR}/libs"
+mkdir -p "${OUT_DIR}/plugins"
+mkdir -p "${OUT_DIR}/config"
+mkdir -p "${OUT_DIR}/bin"
+cp -v -r "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs"
+
+# Copy all tika-pipes plugin zip files
+for dir in tika-pipes/tika-pipes-plugins/*/; do
+    plugin_name=$(basename "$dir")
+    zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip"
+    if [ -f "$zip_file" ]; then
+        cp -v -r "$zip_file" "${OUT_DIR}/plugins"
+    else
+        echo "Plugin file $zip_file does not exist, skipping."
+    fi
+done
+
+# Copy parser package jars as plugins
+parser_packages=(
+    "tika-parsers/tika-parsers-standard/tika-parsers-standard-package"
+    "tika-parsers/tika-parsers-extended/tika-parser-scientific-package"
+    "tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package"
+    "tika-parsers/tika-parsers-ml/tika-parser-nlp-package"
+)
+
+for parser_package in "${parser_packages[@]}"; do
+    package_name=$(basename "$parser_package")
+    jar_file="${parser_package}/target/${package_name}-${TIKA_VERSION}.jar"
+    if [ -f "$jar_file" ]; then
+        cp -v -r "$jar_file" "${OUT_DIR}/plugins"
+    else
+        echo "Parser package file $jar_file does not exist, skipping."
+    fi
+done
+
+cp -v -r "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin"
+
+cp -v "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile"
+
+cd "${OUT_DIR}" || exit
+
+echo "Running docker build from directory: $(pwd)"
+
+# Each tag is kept as two array elements ("-t" and the image reference) so the
+# array expands straight into the docker command line without word splitting.
+IMAGE_TAGS=()
+if [[ -n "${AWS_ACCOUNT_ID}" ]]; then
+    ECR_REGISTRY="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com"
+    aws ecr get-login-password --region "${AWS_REGION}" | docker login --username AWS --password-stdin "${ECR_REGISTRY}"
+    IMAGE_TAGS+=(-t "${ECR_REGISTRY}/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${AZURE_REGISTRY_NAME}" ]]; then
+    az acr login --name "${AZURE_REGISTRY_NAME}"
+    IMAGE_TAGS+=(-t "${AZURE_REGISTRY_NAME}.azurecr.io/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${DOCKER_ID}" ]]; then
+    IMAGE_TAGS+=(-t "${DOCKER_ID}/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [ ${#IMAGE_TAGS[@]} -eq 0 ]; then
+    echo "No image tags specified, skipping Docker build step. To enable build, set AWS_ACCOUNT_ID, AZURE_REGISTRY_NAME, and/or DOCKER_ID environment variables."
+    exit 0
+fi
+
+# Hand the Tika version to the Dockerfile's VERSION build argument so that
+# TIKA_VERSION inside the image names the jar that was staged in libs/ above;
+# without it the image keeps the Dockerfile's default version.
+BUILD_ARGS=(--build-arg "VERSION=${TIKA_VERSION}")
+
+# Space-joined form of the tags, used for the summary message at the end.
+tag="${IMAGE_TAGS[*]}"
+if [ "${MULTI_ARCH}" == "true" ]; then
+    echo "Building multi arch image"
+    # Reuse the builder when a previous run already created it.
+    docker buildx inspect tikabuilder > /dev/null 2>&1 || docker buildx create --name tikabuilder
+    # see https://askubuntu.com/questions/1339558/cant-build-dockerfile-for-arm64-due-to-libc-bin-segmentation-fault/1398147#1398147
+    docker run --rm --privileged tonistiigi/binfmt --install amd64
+    docker run --rm --privileged tonistiigi/binfmt --install arm64
+    docker buildx build \
+        --builder=tikabuilder \
+        "${IMAGE_TAGS[@]}" \
+        "${BUILD_ARGS[@]}" \
+        --platform linux/amd64,linux/arm64 \
+        --push \
+        .
+    docker buildx stop tikabuilder
+else
+    echo "Building single arch image"
+    # build single arch
+    docker build "${IMAGE_TAGS[@]}" "${BUILD_ARGS[@]}" .
+fi
+
+echo " ==================================================================================================="
+echo " Done running docker build with tag ${tag}"
+echo " ==================================================================================================="
diff --git a/tika-grpc/docker-build/start-tika-grpc.sh b/tika-grpc/docker-build/start-tika-grpc.sh
new file mode 100755
index 000000000..ae8e378b7
--- /dev/null
+++ b/tika-grpc/docker-build/start-tika-grpc.sh
@@ -0,0 +1,35 @@
+#!/bin/sh
+# Entrypoint of the tika-grpc image: prints the effective settings and then
+# starts the Tika gRPC server on port 9090.
+# TIKA_VERSION and the TIKA_GRPC_* values are set via ENV in the Dockerfile and
+# can be overridden at "docker run" time with -e.
+echo "Tika Version:"
+echo "${TIKA_VERSION}"
+echo "Tika Plugins:"
+ls "/tika/plugins"
+echo "Tika gRPC Max Inbound Message Size:"
+echo "${TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE}"
+echo "Tika gRPC Max Outbound Message Size:"
+echo "${TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE}"
+echo "Tika gRPC Num Threads:"
+echo "${TIKA_GRPC_NUM_THREADS}"
+# "exec" replaces this shell with the JVM, so no wrapper shell process remains.
+# The --add-opens flags open the listed JDK-internal packages to class-path code.
+exec java \
+  -Dgrpc.server.port=9090 \
+  "-Dgrpc.server.max-inbound-message-size=${TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE}" \
+  "-Dgrpc.server.max-outbound-message-size=${TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE}" \
+  "-Dgrpc.server.numThreads=${TIKA_GRPC_NUM_THREADS}" \
+  --add-opens=jdk.management/com.sun.management.internal=ALL-UNNAMED \
+  --add-opens=java.base/jdk.internal.misc=ALL-UNNAMED \
+  --add-opens=java.base/sun.nio.ch=ALL-UNNAMED \
+  --add-opens=java.management/com.sun.jmx.mbeanserver=ALL-UNNAMED \
+  --add-opens=jdk.internal.jvmstat/sun.jvmstat.monitor=ALL-UNNAMED \
+  --add-opens=java.base/sun.reflect.generics.reflectiveObjects=ALL-UNNAMED \
+  --add-opens=java.base/java.io=ALL-UNNAMED \
+  --add-opens=java.base/java.nio=ALL-UNNAMED \
+  --add-opens=java.base/java.util=ALL-UNNAMED \
+  --add-opens=java.base/java.lang=ALL-UNNAMED \
+  -Djava.net.preferIPv4Stack=true \
+  "-Dplugins.pluginDirs=/tika/plugins" \
+  -jar "/tika/libs/tika-grpc-${TIKA_VERSION}.jar"
