This is an automated email from the ASF dual-hosted git repository.

ndipiazza pushed a commit to branch TIKA-4578
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 08df7e7423b92bffecaad177995563aed2ca8e0f
Author: Nicholas DiPiazza <[email protected]>
AuthorDate: Tue Dec 16 13:50:37 2025 -0600

    TIKA-4578: Add Docker build configuration for tika-grpc
    
    - Add Dockerfile with Ubuntu base, Java 17, OCR, and font support
    - Add docker-build.sh script to build and optionally push images
    - Add start-tika-grpc.sh entrypoint script
    - Include all tika-pipes plugins (fetchers, emitters, iterators)
    - Include parser packages (standard, extended, ML, scientific, sqlite3, NLP)
    - Add README with usage instructions and examples
    - Support multi-arch builds and multiple registries (Docker Hub, ECR, ACR)
---
 tika-grpc/docker-build/Dockerfile         |  39 +++++++++
 tika-grpc/docker-build/README.md          | 128 ++++++++++++++++++++++++++++++
 tika-grpc/docker-build/docker-build.sh    | 113 ++++++++++++++++++++++++++
 tika-grpc/docker-build/start-tika-grpc.sh |  29 +++++++
 4 files changed, 309 insertions(+)

diff --git a/tika-grpc/docker-build/Dockerfile b/tika-grpc/docker-build/Dockerfile
new file mode 100644
index 000000000..9164ee7b2
--- /dev/null
+++ b/tika-grpc/docker-build/Dockerfile
@@ -0,0 +1,58 @@
+# Runtime image for the Tika gRPC server: Ubuntu base with a headless JRE,
+# GDAL, Tesseract OCR language packs and common fonts.
+#
+# The build context must provide libs/, plugins/, config/ and bin/ (see the
+# COPY instructions below).
+# NOTE(review): ubuntu:latest is a moving tag; consider pinning a release for
+# reproducible builds.
+FROM ubuntu:latest
+
+# Package name of the Java runtime to install.
+ARG JRE='openjdk-17-jre-headless'
+
+# System packages are installed before any ARG or COPY that changes from one
+# build to the next, so this expensive layer stays cached when only the Tika
+# artifacts change. The debconf line pre-accepts the Microsoft core fonts EULA
+# so ttf-mscorefonts-installer can run unattended.
+RUN set -eux \
+    && apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends gnupg2 software-properties-common \
+    && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends $JRE \
+        gdal-bin \
+        tesseract-ocr \
+        tesseract-ocr-eng \
+        tesseract-ocr-ita \
+        tesseract-ocr-fra \
+        tesseract-ocr-spa \
+        tesseract-ocr-deu \
+    && echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections \
+    && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \
+        xfonts-utils \
+        fonts-freefont-ttf \
+        fonts-liberation \
+        ttf-mscorefonts-installer \
+        wget \
+        cabextract \
+    && apt-get clean -y \
+    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+# Build-time defaults, exported as runtime environment variables for the
+# entrypoint script.
+ARG VERSION='4.0.0-SNAPSHOT'
+ARG TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE=104857600
+ARG TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE=104857600
+ARG TIKA_GRPC_NUM_THREADS=4
+ENV TIKA_VERSION=$VERSION
+ENV TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE=$TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE
+ENV TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE=$TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE
+ENV TIKA_GRPC_NUM_THREADS=$TIKA_GRPC_NUM_THREADS
+
+# Application artifacts; copied last because they change most often.
+COPY libs/ /tika/libs/
+COPY plugins/ /tika/plugins/
+COPY config/ /tika/config/
+COPY bin/ /tika/bin
+RUN chmod +x "/tika/bin/start-tika-grpc.sh"
+
+EXPOSE 9090
+ENTRYPOINT ["/tika/bin/start-tika-grpc.sh"]
diff --git a/tika-grpc/docker-build/README.md b/tika-grpc/docker-build/README.md
new file mode 100644
index 000000000..7bf2ad5ee
--- /dev/null
+++ b/tika-grpc/docker-build/README.md
@@ -0,0 +1,128 @@
+# Tika gRPC Docker Build
+
+This directory contains the Docker build configuration for the Apache Tika gRPC server.
+
+## Overview
+
+The Docker image includes:
+- Tika gRPC server JAR
+- All Tika Pipes plugins (fetchers, emitters, iterators)
+- Parser packages (standard, extended, ML)
+- OCR support (Tesseract with multiple languages)
+- GDAL for geospatial formats
+- Common fonts
+
+## Building the Docker Image
+
+### Prerequisites
+
+1. Build Tika from the project root:
+```bash
+mvn clean install -DskipTests
+```
+
+2. Set the required environment variable:
+```bash
+export TIKA_VERSION=4.0.0-SNAPSHOT
+```
+
+### Run the Docker Build Script
+
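+From the project root directory (the Docker build step is skipped unless `DOCKER_ID`, `AWS_ACCOUNT_ID` or `AZURE_REGISTRY_NAME` is set):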
+
+```bash
+./tika-grpc/docker-build/docker-build.sh
+```
+
+### Build Environment Variables
+
+- `TIKA_VERSION`: Maven project version (required)
+- `RELEASE_IMAGE_TAG`: Override the default tag (defaults to `TIKA_VERSION` without `-SNAPSHOT`)
+- `DOCKER_ID`: Docker Hub username; the image is tagged as `<DOCKER_ID>/<PROJECT_NAME>`
+- `AWS_ACCOUNT_ID`: AWS account ID; the script logs in to ECR and tags the image for it
+- `AWS_REGION`: AWS region for ECR (default: us-west-2)
+- `AZURE_REGISTRY_NAME`: Azure Container Registry name; the script logs in to ACR and tags the image for it
+- `MULTI_ARCH`: Build for linux/amd64 and linux/arm64 and push the result (default: false)
+- `PROJECT_NAME`: Docker image name (default: tika-grpc)
+
+### Examples
+
+Build and tag for Docker Hub:
+```bash
+export TIKA_VERSION=4.0.0-SNAPSHOT
+export DOCKER_ID=myusername
+./tika-grpc/docker-build/docker-build.sh
+docker push myusername/tika-grpc:4.0.0
+```
+
+Build and tag for AWS ECR (a single-arch build is not pushed; run `docker push` afterwards):
+```bash
+export TIKA_VERSION=4.0.0-SNAPSHOT
+export AWS_ACCOUNT_ID=123456789012
+export AWS_REGION=us-east-1
+./tika-grpc/docker-build/docker-build.sh
+```
+
+Build and push a multi-architecture image:
+```bash
+export TIKA_VERSION=4.0.0-SNAPSHOT
+export DOCKER_ID=myusername
+export MULTI_ARCH=true
+./tika-grpc/docker-build/docker-build.sh
+```
+
+## Running the Docker Container
+
+```bash
+docker run -p 9090:9090 myusername/tika-grpc:4.0.0
+```
+
+### Environment Variables
+
+- `TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE`: Maximum inbound message size (default: 104857600)
+- `TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE`: Maximum outbound message size (default: 104857600)
+- `TIKA_GRPC_NUM_THREADS`: Number of gRPC server threads (default: 4)
+
+### Example with Custom Settings
+
+```bash
+docker run -p 9090:9090 \
+  -e TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE=209715200 \
+  -e TIKA_GRPC_NUM_THREADS=8 \
+  myusername/tika-grpc:4.0.0
+```
+
+## Included Plugins
+
+The Docker image includes all available Tika Pipes plugins:
+
+### Fetchers/Emitters
+- tika-pipes-file-system
+- tika-pipes-http
+- tika-pipes-s3
+- tika-pipes-az-blob
+- tika-pipes-gcs
+- tika-pipes-jdbc
+- tika-pipes-kafka
+- tika-pipes-microsoft-graph
+- tika-pipes-solr
+- tika-pipes-opensearch
+- tika-pipes-json
+- tika-pipes-csv
+
+### Parser Packages
+- tika-parsers-standard-package (included in base JAR)
+- tika-parser-scientific-package
+- tika-parser-sqlite3-package
+- tika-parser-nlp-package
+
+## Tesseract OCR Languages
+
+The following Tesseract language packs are pre-installed:
+- English (eng)
+- Italian (ita)
+- French (fra)
+- Spanish (spa)
+- German (deu)
+
+Additional languages can be added by modifying the Dockerfile.
diff --git a/tika-grpc/docker-build/docker-build.sh b/tika-grpc/docker-build/docker-build.sh
new file mode 100755
index 000000000..e95c470ae
--- /dev/null
+++ b/tika-grpc/docker-build/docker-build.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+# This script is intended to be run from Maven exec plugin during the package 
phase of maven build
+
+if [ -z "${TIKA_VERSION}" ]; then
+    echo "Environment variable TIKA_VERSION is required, and should match the 
maven project version of Tika"
+    exit 1
+fi
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+cd "${SCRIPT_DIR}/../../" || exit
+
+OUT_DIR=target/tika-docker
+
+MULTI_ARCH=${MULTI_ARCH:-false}
+AWS_REGION=${AWS_REGION:-us-west-2}
+AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID:-}
+AZURE_REGISTRY_NAME=${AZURE_REGISTRY_NAME:-}
+DOCKER_ID=${DOCKER_ID:-}
+PROJECT_NAME=${PROJECT_NAME:-tika-grpc}
+
+# If RELEASE_IMAGE_TAG not specified, use TIKA_VERSION
+if [[ -z "${RELEASE_IMAGE_TAG}" ]]; then
+    RELEASE_IMAGE_TAG="${TIKA_VERSION}"
+    ## Remove '-SNAPSHOT' from the version string
+    RELEASE_IMAGE_TAG="${RELEASE_IMAGE_TAG//-SNAPSHOT/}"
+fi
+
+mkdir -p "${OUT_DIR}/libs"
+mkdir -p "${OUT_DIR}/plugins"
+mkdir -p "${OUT_DIR}/config"
+mkdir -p "${OUT_DIR}/bin"
+cp -v -r "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs"
+
+# Copy all tika-pipes plugin zip files
+for dir in tika-pipes/tika-pipes-plugins/*/; do
+    plugin_name=$(basename "$dir")
+    zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip"
+    if [ -f "$zip_file" ]; then
+        cp -v -r "$zip_file" "${OUT_DIR}/plugins"
+    else
+        echo "Plugin file $zip_file does not exist, skipping."
+    fi
+done
+
+# Copy parser package jars as plugins
+parser_packages=(
+    "tika-parsers/tika-parsers-standard/tika-parsers-standard-package"
+    "tika-parsers/tika-parsers-extended/tika-parser-scientific-package"
+    "tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package"
+    "tika-parsers/tika-parsers-ml/tika-parser-nlp-package"
+)
+
+for parser_package in "${parser_packages[@]}"; do
+    package_name=$(basename "$parser_package")
+    jar_file="${parser_package}/target/${package_name}-${TIKA_VERSION}.jar"
+    if [ -f "$jar_file" ]; then
+        cp -v -r "$jar_file" "${OUT_DIR}/plugins"
+    else
+        echo "Parser package file $jar_file does not exist, skipping."
+    fi
+done
+
+cp -v -r "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin"
+
+cp -v "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile"
+
+cd "${OUT_DIR}" || exit
+
+echo "Running docker build from directory: $(pwd)"
+
+IMAGE_TAGS=()
+if [[ -n "${AWS_ACCOUNT_ID}" ]]; then
+    aws ecr get-login-password --region ${AWS_REGION} | docker login 
--username AWS --password-stdin 
${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com
+    IMAGE_TAGS+=("-t 
${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${AZURE_REGISTRY_NAME}" ]]; then
+    az acr login --name ${AZURE_REGISTRY_NAME}
+    IMAGE_TAGS+=("-t 
${AZURE_REGISTRY_NAME}.azurecr.io/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${DOCKER_ID}" ]]; then
+    IMAGE_TAGS+=("-t ${DOCKER_ID}/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [ ${#IMAGE_TAGS[@]} -eq 0 ]; then
+    echo "No image tags specified, skipping Docker build step. To enable 
build, set AWS_ACCOUNT_ID, AZURE_REGISTRY_NAME, and/or DOCKER_ID environment 
variables."
+    exit 0
+fi
+
+tag="${IMAGE_TAGS[*]}"
+if [ "${MULTI_ARCH}" == "true" ]; then
+  echo "Building multi arch image"
+  docker buildx create --name tikabuilder
+  # see 
https://askubuntu.com/questions/1339558/cant-build-dockerfile-for-arm64-due-to-libc-bin-segmentation-fault/1398147#1398147
+  docker run --rm --privileged tonistiigi/binfmt --install amd64
+  docker run --rm --privileged tonistiigi/binfmt --install arm64
+  docker buildx build \
+      --builder=tikabuilder . \
+      ${tag} \
+      --platform linux/amd64,linux/arm64 \
+      --push
+  docker buildx stop tikabuilder
+else
+  echo "Building single arch image"
+  # build single arch
+  docker build . ${tag}
+fi
+
+echo " 
==================================================================================================="
+echo " Done running docker build with tag ${tag}"
+echo " 
==================================================================================================="
diff --git a/tika-grpc/docker-build/start-tika-grpc.sh b/tika-grpc/docker-build/start-tika-grpc.sh
new file mode 100755
index 000000000..ae8e378b7
--- /dev/null
+++ b/tika-grpc/docker-build/start-tika-grpc.sh
@@ -0,0 +1,29 @@
+#!/bin/sh
+echo "Tika Version:"
+echo "${TIKA_VERSION}"
+echo "Tika Plugins:"
+ls "/tika/plugins"
+echo "Tika gRPC Max Inbound Message Size:"
+echo "${TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE}"
+echo "Tika gRPC Max Outbound Message Size:"
+echo "${TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE}"
+echo "Tika gRPC Num Threads:"
+echo "${TIKA_GRPC_NUM_THREADS}"
+exec java \
+  -Dgrpc.server.port=9090 \
+  
"-Dgrpc.server.max-inbound-message-size=${TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE}" \
+  
"-Dgrpc.server.max-outbound-message-size=${TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE}"
 \
+  "-Dgrpc.server.numThreads=${TIKA_GRPC_NUM_THREADS}" \
+  --add-opens=jdk.management/com.sun.management.internal=ALL-UNNAMED \
+  --add-opens=java.base/jdk.internal.misc=ALL-UNNAMED \
+  --add-opens=java.base/sun.nio.ch=ALL-UNNAMED \
+  --add-opens=java.management/com.sun.jmx.mbeanserver=ALL-UNNAMED \
+  --add-opens=jdk.internal.jvmstat/sun.jvmstat.monitor=ALL-UNNAMED \
+  --add-opens=java.base/sun.reflect.generics.reflectiveObjects=ALL-UNNAMED \
+  --add-opens=java.base/java.io=ALL-UNNAMED \
+  --add-opens=java.base/java.nio=ALL-UNNAMED \
+  --add-opens=java.base/java.util=ALL-UNNAMED \
+  --add-opens=java.base/java.lang=ALL-UNNAMED \
+  -Djava.net.preferIPv4Stack=true \
+  "-Dplugins.pluginDirs=/tika/plugins" \
+  -jar "/tika/libs/tika-grpc-${TIKA_VERSION}.jar"

Reply via email to