Copilot commented on code in PR #2462:
URL: https://github.com/apache/tika/pull/2462#discussion_r2624780551
##########
tika-grpc/docker-build/docker-build.sh:
##########
@@ -0,0 +1,113 @@
+#!/bin/bash
+# This script is intended to be run from Maven exec plugin during the package phase of maven build
+
+if [ -z "${TIKA_VERSION}" ]; then
+ echo "Environment variable TIKA_VERSION is required, and should match the maven project version of Tika"
+ exit 1
+fi
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+cd "${SCRIPT_DIR}/../../" || exit
+
+OUT_DIR=target/tika-docker
+
+MULTI_ARCH=${MULTI_ARCH:-false}
+AWS_REGION=${AWS_REGION:-us-west-2}
+AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID:-}
+AZURE_REGISTRY_NAME=${AZURE_REGISTRY_NAME:-}
+DOCKER_ID=${DOCKER_ID:-}
+PROJECT_NAME=${PROJECT_NAME:-tika-grpc}
+
+# If RELEASE_IMAGE_TAG not specified, use TIKA_VERSION
+if [[ -z "${RELEASE_IMAGE_TAG}" ]]; then
+ RELEASE_IMAGE_TAG="${TIKA_VERSION}"
+ ## Remove '-SNAPSHOT' from the version string
+ RELEASE_IMAGE_TAG="${RELEASE_IMAGE_TAG//-SNAPSHOT/}"
+fi
+
+mkdir -p "${OUT_DIR}/libs"
+mkdir -p "${OUT_DIR}/plugins"
+mkdir -p "${OUT_DIR}/config"
+mkdir -p "${OUT_DIR}/bin"
+cp -v -r "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs"
+
+# Copy all tika-pipes plugin zip files
+for dir in tika-pipes/tika-pipes-plugins/*/; do
+ plugin_name=$(basename "$dir")
+ zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip"
+ if [ -f "$zip_file" ]; then
+ cp -v -r "$zip_file" "${OUT_DIR}/plugins"
+ else
+ echo "Plugin file $zip_file does not exist, skipping."
+ fi
+done
+
+# Copy parser package jars as plugins
+parser_packages=(
+ "tika-parsers/tika-parsers-standard/tika-parsers-standard-package"
+ "tika-parsers/tika-parsers-extended/tika-parser-scientific-package"
+ "tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package"
+ "tika-parsers/tika-parsers-ml/tika-parser-nlp-package"
+)
+
+for parser_package in "${parser_packages[@]}"; do
+ package_name=$(basename "$parser_package")
+ jar_file="${parser_package}/target/${package_name}-${TIKA_VERSION}.jar"
+ if [ -f "$jar_file" ]; then
+ cp -v -r "$jar_file" "${OUT_DIR}/plugins"
+ else
+ echo "Parser package file $jar_file does not exist, skipping."
+ fi
+done
+
+cp -v -r "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin"
+
+cp -v "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile"
+
+cd "${OUT_DIR}" || exit
+
+echo "Running docker build from directory: $(pwd)"
+
+IMAGE_TAGS=()
+if [[ -n "${AWS_ACCOUNT_ID}" ]]; then
+ aws ecr get-login-password --region ${AWS_REGION} | docker login --username AWS --password-stdin ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com
+ IMAGE_TAGS+=("-t ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${AZURE_REGISTRY_NAME}" ]]; then
+ az acr login --name ${AZURE_REGISTRY_NAME}
+ IMAGE_TAGS+=("-t ${AZURE_REGISTRY_NAME}.azurecr.io/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${DOCKER_ID}" ]]; then
+ IMAGE_TAGS+=("-t ${DOCKER_ID}/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [ ${#IMAGE_TAGS[@]} -eq 0 ]; then
+ echo "No image tags specified, skipping Docker build step. To enable build, set AWS_ACCOUNT_ID, AZURE_REGISTRY_NAME, and/or DOCKER_ID environment variables."
+ exit 0
+fi
+
+tag="${IMAGE_TAGS[*]}"
+if [ "${MULTI_ARCH}" == "true" ]; then
+ echo "Building multi arch image"
+ docker buildx create --name tikabuilder
+ # see https://askubuntu.com/questions/1339558/cant-build-dockerfile-for-arm64-due-to-libc-bin-segmentation-fault/1398147#1398147
+ docker run --rm --privileged tonistiigi/binfmt --install amd64
+ docker run --rm --privileged tonistiigi/binfmt --install arm64
+ docker buildx build \
+ --builder=tikabuilder . \
+ ${tag} \
+ --platform linux/amd64,linux/arm64 \
+ --push
+ docker buildx stop tikabuilder
Review Comment:
The buildx builder is stopped but not removed after use. This will leave the
builder instance around, potentially causing issues on subsequent runs. Add
'docker buildx rm tikabuilder' after the stop command to clean up properly.
```suggestion
docker buildx stop tikabuilder
docker buildx rm tikabuilder
```
##########
tika-grpc/docker-build/start-tika-grpc.sh:
##########
@@ -0,0 +1,29 @@
+#!/bin/sh
Review Comment:
The shebang uses '/bin/sh', whereas docker-build.sh uses '#!/bin/bash'. This
script appears to be POSIX-compliant today, so '/bin/sh' works, but it would
break if bash-specific features were added later. For consistency with
docker-build.sh, consider using the same shebang here or ensuring the script
remains strictly POSIX-compliant.
```suggestion
#!/bin/bash
```
##########
tika-grpc/pom.xml:
##########
@@ -38,7 +38,44 @@
<asarkar-grpc-test.version>2.0.0</asarkar-grpc-test.version>
<awaitility.version>4.3.0</awaitility.version>
<j2objc-annotations.version>3.1</j2objc-annotations.version>
+ <skip.docker.build>true</skip.docker.build>
</properties>
+
+ <profiles>
+ <profile>
+ <id>enable-docker-build-aws</id>
+ <activation>
+ <property>
+ <name>env.AWS_ACCOUNT_ID</name>
+ </property>
+ </activation>
+ <properties>
+ <skip.docker.build>false</skip.docker.build>
+ </properties>
+ </profile>
+ <profile>
+ <id>enable-docker-build-azure</id>
+ <activation>
+ <property>
+ <name>env.AZURE_REGISTRY_NAME</name>
+ </property>
+ </activation>
+ <properties>
+ <skip.docker.build>false</skip.docker.build>
+ </properties>
+ </profile>
+ <profile>
+ <id>enable-docker-build-dockerhub</id>
+ <activation>
+ <property>
+ <name>env.DOCKER_ID</name>
+ </property>
+ </activation>
+ <properties>
+ <skip.docker.build>false</skip.docker.build>
+ </properties>
+ </profile>
Review Comment:
The Maven profiles are activated based on environment variable presence, but
the documentation suggests passing -Dskip.docker.build=false on the command
line. This creates two different activation mechanisms which could be
confusing. If env.DOCKER_ID is set, the profile activates and sets
skip.docker.build=false, but the documentation also shows using
-Dskip.docker.build=false explicitly. Consider documenting that either the
environment variables OR the property can be used, and clarify the precedence
between them.
##########
tika-grpc/docker-build/docker-build.sh:
##########
@@ -0,0 +1,113 @@
+#!/bin/bash
+# This script is intended to be run from Maven exec plugin during the package phase of maven build
+
+if [ -z "${TIKA_VERSION}" ]; then
+ echo "Environment variable TIKA_VERSION is required, and should match the maven project version of Tika"
+ exit 1
+fi
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+cd "${SCRIPT_DIR}/../../" || exit
+
+OUT_DIR=target/tika-docker
+
+MULTI_ARCH=${MULTI_ARCH:-false}
+AWS_REGION=${AWS_REGION:-us-west-2}
+AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID:-}
+AZURE_REGISTRY_NAME=${AZURE_REGISTRY_NAME:-}
+DOCKER_ID=${DOCKER_ID:-}
+PROJECT_NAME=${PROJECT_NAME:-tika-grpc}
+
+# If RELEASE_IMAGE_TAG not specified, use TIKA_VERSION
+if [[ -z "${RELEASE_IMAGE_TAG}" ]]; then
+ RELEASE_IMAGE_TAG="${TIKA_VERSION}"
+ ## Remove '-SNAPSHOT' from the version string
+ RELEASE_IMAGE_TAG="${RELEASE_IMAGE_TAG//-SNAPSHOT/}"
+fi
+
+mkdir -p "${OUT_DIR}/libs"
+mkdir -p "${OUT_DIR}/plugins"
+mkdir -p "${OUT_DIR}/config"
+mkdir -p "${OUT_DIR}/bin"
+cp -v -r "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs"
+
+# Copy all tika-pipes plugin zip files
+for dir in tika-pipes/tika-pipes-plugins/*/; do
+ plugin_name=$(basename "$dir")
+ zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip"
+ if [ -f "$zip_file" ]; then
+ cp -v -r "$zip_file" "${OUT_DIR}/plugins"
+ else
+ echo "Plugin file $zip_file does not exist, skipping."
+ fi
+done
+
+# Copy parser package jars as plugins
+parser_packages=(
+ "tika-parsers/tika-parsers-standard/tika-parsers-standard-package"
+ "tika-parsers/tika-parsers-extended/tika-parser-scientific-package"
+ "tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package"
+ "tika-parsers/tika-parsers-ml/tika-parser-nlp-package"
+)
+
+for parser_package in "${parser_packages[@]}"; do
+ package_name=$(basename "$parser_package")
+ jar_file="${parser_package}/target/${package_name}-${TIKA_VERSION}.jar"
+ if [ -f "$jar_file" ]; then
+ cp -v -r "$jar_file" "${OUT_DIR}/plugins"
+ else
+ echo "Parser package file $jar_file does not exist, skipping."
+ fi
+done
+
+cp -v -r "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin"
+
+cp -v "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile"
+
+cd "${OUT_DIR}" || exit
+
+echo "Running docker build from directory: $(pwd)"
+
+IMAGE_TAGS=()
+if [[ -n "${AWS_ACCOUNT_ID}" ]]; then
+ aws ecr get-login-password --region ${AWS_REGION} | docker login --username AWS --password-stdin ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com
+ IMAGE_TAGS+=("-t ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${AZURE_REGISTRY_NAME}" ]]; then
+ az acr login --name ${AZURE_REGISTRY_NAME}
+ IMAGE_TAGS+=("-t ${AZURE_REGISTRY_NAME}.azurecr.io/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
Review Comment:
The AWS CLI and Azure CLI login commands are missing error handling. If the
login fails (e.g., credentials are not configured), the script will continue
and the subsequent docker build command will fail with a potentially confusing
error. Add error checking after each login command to exit if authentication
fails.
##########
tika-grpc/docker-build/README.md:
##########
@@ -0,0 +1,170 @@
+# Tika gRPC Docker Build
+
+This directory contains the Docker build configuration for Apache Tika gRPC
server.
+
+## Overview
+
+The Docker image includes:
+- Tika gRPC server JAR
+- All Tika Pipes plugins (fetchers, emitters, iterators)
+- Parser packages (standard, extended, ML)
+- OCR support (Tesseract with multiple languages)
+- GDAL for geospatial formats
+- Common fonts
+
+## Building the Docker Image
+
+### Prerequisites
+
+1. Build Tika from the project root (this builds all modules including
plugins):
+```bash
+mvn clean install -DskipTests
+```
+
+### Option 1: Run Docker Build During Maven Package
+
+The Docker build can be triggered automatically during the Maven package phase:
+
+```bash
+cd tika-grpc
+mvn package -Dskip.docker.build=false
+```
+
+Or from the project root:
+```bash
+mvn clean install -DskipTests -Dskip.docker.build=false
+```
+
+**Note:** By default, `skip.docker.build=true` to avoid running Docker builds
during normal development.
+
+#### Controlling Docker Build with Environment Variables
+
+All docker-build.sh environment variables are passed through from your shell:
+
+```bash
+# Build and push to Docker Hub
+MULTI_ARCH=false DOCKER_ID=ndipiazza PROJECT_NAME=tika-grpc RELEASE_IMAGE_TAG=4.0.0-SNAPSHOT \
+ mvn clean package -Dskip.docker.build=false
+```
Review Comment:
The example shows running 'mvn package' but the script copies files from
'tika-pipes/tika-pipes-plugins/*' and 'tika-parsers/*' which are sibling
directories to tika-grpc. These files won't exist unless a full build from the
root is performed. The example should clarify that users need to run 'mvn clean
install -DskipTests' from the project root first, or update the example to show
the command being run from the root with '-f tika-grpc' as shown in the PR
description.
##########
tika-grpc/README.md:
##########
@@ -11,3 +11,59 @@ This server will manage a pool of Tika Pipes clients.
* Delete
* Fetch + Parse a given Fetch Item
+## Building
+
+### Standard Build
+
+```bash
+mvn clean install -DskipTests
+```
+
+### Building Docker Image
+
+The tika-grpc module includes Docker build support. See
[docker-build/README.md](docker-build/README.md) for complete documentation.
+
+#### Quick Start - Docker Build
+
+**Option 1: Build with Maven (recommended)**
+
+```bash
+cd tika-grpc
+mvn package -Dskip.docker.build=false
+```
+
+Or from the project root:
+```bash
+mvn clean install -DskipTests -Dskip.docker.build=false
Review Comment:
The example on line 37 shows running 'mvn clean install -DskipTests
-Dskip.docker.build=false' from the project root, but this will build ALL Tika
modules with Docker builds enabled. This contradicts the stated purpose of just
building the tika-grpc module. The example should either add '-pl tika-grpc
-am' to only build tika-grpc and its dependencies, or clarify that this builds
the entire project.
```suggestion
mvn clean install -DskipTests -Dskip.docker.build=false -pl tika-grpc -am
```
##########
tika-grpc/docker-build/docker-build.sh:
##########
@@ -0,0 +1,113 @@
+#!/bin/bash
+# This script is intended to be run from Maven exec plugin during the package phase of maven build
+
+if [ -z "${TIKA_VERSION}" ]; then
+ echo "Environment variable TIKA_VERSION is required, and should match the maven project version of Tika"
+ exit 1
+fi
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+cd "${SCRIPT_DIR}/../../" || exit
+
+OUT_DIR=target/tika-docker
+
+MULTI_ARCH=${MULTI_ARCH:-false}
+AWS_REGION=${AWS_REGION:-us-west-2}
+AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID:-}
+AZURE_REGISTRY_NAME=${AZURE_REGISTRY_NAME:-}
+DOCKER_ID=${DOCKER_ID:-}
+PROJECT_NAME=${PROJECT_NAME:-tika-grpc}
+
+# If RELEASE_IMAGE_TAG not specified, use TIKA_VERSION
+if [[ -z "${RELEASE_IMAGE_TAG}" ]]; then
+ RELEASE_IMAGE_TAG="${TIKA_VERSION}"
+ ## Remove '-SNAPSHOT' from the version string
+ RELEASE_IMAGE_TAG="${RELEASE_IMAGE_TAG//-SNAPSHOT/}"
+fi
+
+mkdir -p "${OUT_DIR}/libs"
+mkdir -p "${OUT_DIR}/plugins"
+mkdir -p "${OUT_DIR}/config"
+mkdir -p "${OUT_DIR}/bin"
+cp -v -r "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs"
+
+# Copy all tika-pipes plugin zip files
+for dir in tika-pipes/tika-pipes-plugins/*/; do
+ plugin_name=$(basename "$dir")
+ zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip"
+ if [ -f "$zip_file" ]; then
+ cp -v -r "$zip_file" "${OUT_DIR}/plugins"
+ else
+ echo "Plugin file $zip_file does not exist, skipping."
+ fi
+done
+
+# Copy parser package jars as plugins
+parser_packages=(
+ "tika-parsers/tika-parsers-standard/tika-parsers-standard-package"
+ "tika-parsers/tika-parsers-extended/tika-parser-scientific-package"
+ "tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package"
+ "tika-parsers/tika-parsers-ml/tika-parser-nlp-package"
+)
+
+for parser_package in "${parser_packages[@]}"; do
+ package_name=$(basename "$parser_package")
+ jar_file="${parser_package}/target/${package_name}-${TIKA_VERSION}.jar"
+ if [ -f "$jar_file" ]; then
+ cp -v -r "$jar_file" "${OUT_DIR}/plugins"
+ else
+ echo "Parser package file $jar_file does not exist, skipping."
+ fi
+done
+
+cp -v -r "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin"
+
+cp -v "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile"
+
+cd "${OUT_DIR}" || exit
+
+echo "Running docker build from directory: $(pwd)"
+
+IMAGE_TAGS=()
+if [[ -n "${AWS_ACCOUNT_ID}" ]]; then
+ aws ecr get-login-password --region ${AWS_REGION} | docker login --username AWS --password-stdin ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com
+ IMAGE_TAGS+=("-t ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${AZURE_REGISTRY_NAME}" ]]; then
+ az acr login --name ${AZURE_REGISTRY_NAME}
+ IMAGE_TAGS+=("-t ${AZURE_REGISTRY_NAME}.azurecr.io/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${DOCKER_ID}" ]]; then
+ IMAGE_TAGS+=("-t ${DOCKER_ID}/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [ ${#IMAGE_TAGS[@]} -eq 0 ]; then
+ echo "No image tags specified, skipping Docker build step. To enable build, set AWS_ACCOUNT_ID, AZURE_REGISTRY_NAME, and/or DOCKER_ID environment variables."
+ exit 0
+fi
+
+tag="${IMAGE_TAGS[*]}"
+if [ "${MULTI_ARCH}" == "true" ]; then
+ echo "Building multi arch image"
+ docker buildx create --name tikabuilder
+ # see https://askubuntu.com/questions/1339558/cant-build-dockerfile-for-arm64-due-to-libc-bin-segmentation-fault/1398147#1398147
+ docker run --rm --privileged tonistiigi/binfmt --install amd64
+ docker run --rm --privileged tonistiigi/binfmt --install arm64
+ docker buildx build \
+ --builder=tikabuilder . \
+ ${tag} \
+ --platform linux/amd64,linux/arm64 \
+ --push
+ docker buildx stop tikabuilder
+else
+ echo "Building single arch image"
+ # build single arch
+ docker build . ${tag}
Review Comment:
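The variable expansion on line 92 using "${IMAGE_TAGS[*]}" will concatenate
all tags into a single space-separated string. This works for the docker build
command on line 108 only because ${tag} is then expanded unquoted, so the shell
splits it on whitespace again. However, this is fragile - if any tag contains
spaces or glob characters, it will break. While the current usage appears
correct, consider using a loop to build the docker command with proper quoting
for better robustness (for example, store '-t' and each image name as separate
array elements and pass "${IMAGE_TAGS[@]}" directly), or add validation to
ensure tags don't contain spaces.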
##########
tika-grpc/docker-build/docker-build.sh:
##########
@@ -0,0 +1,113 @@
+#!/bin/bash
+# This script is intended to be run from Maven exec plugin during the package phase of maven build
+
+if [ -z "${TIKA_VERSION}" ]; then
+ echo "Environment variable TIKA_VERSION is required, and should match the maven project version of Tika"
+ exit 1
+fi
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+cd "${SCRIPT_DIR}/../../" || exit
+
+OUT_DIR=target/tika-docker
+
+MULTI_ARCH=${MULTI_ARCH:-false}
+AWS_REGION=${AWS_REGION:-us-west-2}
+AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID:-}
+AZURE_REGISTRY_NAME=${AZURE_REGISTRY_NAME:-}
+DOCKER_ID=${DOCKER_ID:-}
+PROJECT_NAME=${PROJECT_NAME:-tika-grpc}
+
+# If RELEASE_IMAGE_TAG not specified, use TIKA_VERSION
+if [[ -z "${RELEASE_IMAGE_TAG}" ]]; then
+ RELEASE_IMAGE_TAG="${TIKA_VERSION}"
+ ## Remove '-SNAPSHOT' from the version string
+ RELEASE_IMAGE_TAG="${RELEASE_IMAGE_TAG//-SNAPSHOT/}"
+fi
+
+mkdir -p "${OUT_DIR}/libs"
+mkdir -p "${OUT_DIR}/plugins"
+mkdir -p "${OUT_DIR}/config"
+mkdir -p "${OUT_DIR}/bin"
+cp -v -r "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs"
+
+# Copy all tika-pipes plugin zip files
+for dir in tika-pipes/tika-pipes-plugins/*/; do
+ plugin_name=$(basename "$dir")
+ zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip"
+ if [ -f "$zip_file" ]; then
+ cp -v -r "$zip_file" "${OUT_DIR}/plugins"
+ else
+ echo "Plugin file $zip_file does not exist, skipping."
+ fi
+done
+
+# Copy parser package jars as plugins
+parser_packages=(
+ "tika-parsers/tika-parsers-standard/tika-parsers-standard-package"
+ "tika-parsers/tika-parsers-extended/tika-parser-scientific-package"
+ "tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package"
+ "tika-parsers/tika-parsers-ml/tika-parser-nlp-package"
+)
+
+for parser_package in "${parser_packages[@]}"; do
+ package_name=$(basename "$parser_package")
+ jar_file="${parser_package}/target/${package_name}-${TIKA_VERSION}.jar"
+ if [ -f "$jar_file" ]; then
+ cp -v -r "$jar_file" "${OUT_DIR}/plugins"
+ else
+ echo "Parser package file $jar_file does not exist, skipping."
+ fi
+done
+
+cp -v -r "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin"
+
+cp -v "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile"
+
+cd "${OUT_DIR}" || exit
+
+echo "Running docker build from directory: $(pwd)"
+
+IMAGE_TAGS=()
+if [[ -n "${AWS_ACCOUNT_ID}" ]]; then
+ aws ecr get-login-password --region ${AWS_REGION} | docker login --username AWS --password-stdin ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com
+ IMAGE_TAGS+=("-t ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${AZURE_REGISTRY_NAME}" ]]; then
+ az acr login --name ${AZURE_REGISTRY_NAME}
+ IMAGE_TAGS+=("-t ${AZURE_REGISTRY_NAME}.azurecr.io/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${DOCKER_ID}" ]]; then
+ IMAGE_TAGS+=("-t ${DOCKER_ID}/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
Review Comment:
Variables used in shell commands should be quoted to prevent word splitting
and glob expansion. The variables AWS_REGION, AWS_ACCOUNT_ID,
AZURE_REGISTRY_NAME, DOCKER_ID, PROJECT_NAME, and RELEASE_IMAGE_TAG should be
wrapped in double quotes throughout the script. For example, line 74 should use
"${AWS_REGION}" instead of ${AWS_REGION}.
##########
tika-grpc/docker-build/docker-build.sh:
##########
@@ -0,0 +1,113 @@
+#!/bin/bash
+# This script is intended to be run from Maven exec plugin during the package phase of maven build
+
+if [ -z "${TIKA_VERSION}" ]; then
+ echo "Environment variable TIKA_VERSION is required, and should match the maven project version of Tika"
+ exit 1
+fi
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+cd "${SCRIPT_DIR}/../../" || exit
+
+OUT_DIR=target/tika-docker
+
+MULTI_ARCH=${MULTI_ARCH:-false}
+AWS_REGION=${AWS_REGION:-us-west-2}
+AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID:-}
+AZURE_REGISTRY_NAME=${AZURE_REGISTRY_NAME:-}
+DOCKER_ID=${DOCKER_ID:-}
+PROJECT_NAME=${PROJECT_NAME:-tika-grpc}
+
+# If RELEASE_IMAGE_TAG not specified, use TIKA_VERSION
+if [[ -z "${RELEASE_IMAGE_TAG}" ]]; then
+ RELEASE_IMAGE_TAG="${TIKA_VERSION}"
+ ## Remove '-SNAPSHOT' from the version string
+ RELEASE_IMAGE_TAG="${RELEASE_IMAGE_TAG//-SNAPSHOT/}"
+fi
+
+mkdir -p "${OUT_DIR}/libs"
+mkdir -p "${OUT_DIR}/plugins"
+mkdir -p "${OUT_DIR}/config"
+mkdir -p "${OUT_DIR}/bin"
+cp -v -r "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs"
+
+# Copy all tika-pipes plugin zip files
+for dir in tika-pipes/tika-pipes-plugins/*/; do
+ plugin_name=$(basename "$dir")
+ zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip"
+ if [ -f "$zip_file" ]; then
+ cp -v -r "$zip_file" "${OUT_DIR}/plugins"
+ else
+ echo "Plugin file $zip_file does not exist, skipping."
+ fi
+done
+
Review Comment:
The script copies plugin files from 'tika-pipes/tika-pipes-plugins/*/' but
if a plugin doesn't exist or hasn't been built, it silently skips it with just
an echo message. This could lead to missing plugins in the Docker image without
a clear indication that something went wrong. Consider making the build fail or
at least provide a warning if expected plugins are missing, especially for
critical plugins.
```suggestion
echo "WARNING: Plugin file $zip_file does not exist, skipping."
fi
done
# List of required (critical) plugins - add plugin directory names (not full path)
REQUIRED_PLUGINS=(
# Add critical plugin names here, e.g.:
# "tika-pipes-example-plugin"
)
# Check for required plugin zip files
for plugin_name in "${REQUIRED_PLUGINS[@]}"; do
zip_file="tika-pipes/tika-pipes-plugins/${plugin_name}/target/${plugin_name}-${TIKA_VERSION}.zip"
if [ ! -f "$zip_file" ]; then
echo "ERROR: Required plugin file $zip_file is missing. Failing build."
exit 1
fi
done
```
##########
tika-grpc/docker-build/README.md:
##########
@@ -0,0 +1,170 @@
+# Tika gRPC Docker Build
+
+This directory contains the Docker build configuration for Apache Tika gRPC
server.
+
+## Overview
+
+The Docker image includes:
+- Tika gRPC server JAR
+- All Tika Pipes plugins (fetchers, emitters, iterators)
+- Parser packages (standard, extended, ML)
+- OCR support (Tesseract with multiple languages)
+- GDAL for geospatial formats
+- Common fonts
+
+## Building the Docker Image
+
+### Prerequisites
+
+1. Build Tika from the project root (this builds all modules including
plugins):
+```bash
+mvn clean install -DskipTests
+```
+
+### Option 1: Run Docker Build During Maven Package
+
+The Docker build can be triggered automatically during the Maven package phase:
+
+```bash
+cd tika-grpc
+mvn package -Dskip.docker.build=false
+```
+
+Or from the project root:
+```bash
+mvn clean install -DskipTests -Dskip.docker.build=false
+```
+
+**Note:** By default, `skip.docker.build=true` to avoid running Docker builds
during normal development.
+
+#### Controlling Docker Build with Environment Variables
+
+All docker-build.sh environment variables are passed through from your shell:
+
+```bash
+# Build and push to Docker Hub
+MULTI_ARCH=false DOCKER_ID=ndipiazza PROJECT_NAME=tika-grpc RELEASE_IMAGE_TAG=4.0.0-SNAPSHOT \
+ mvn clean package -Dskip.docker.build=false
+```
+
+```bash
+# Build multi-arch and push to Docker Hub
+MULTI_ARCH=true DOCKER_ID=myusername PROJECT_NAME=tika-grpc \
+ mvn clean package -Dskip.docker.build=false
+```
+
+```bash
+# Build and push to AWS ECR
+AWS_ACCOUNT_ID=123456789012 AWS_REGION=us-east-1 \
+ mvn clean package -Dskip.docker.build=false
+```
+
+```bash
+# Build and push to Azure Container Registry
+AZURE_REGISTRY_NAME=myregistry \
+ mvn clean package -Dskip.docker.build=false
+```
+
+### Option 2: Run the Docker Build Script Manually
+
+Set the required environment variable and run the script:
+
+```bash
+export TIKA_VERSION=4.0.0-SNAPSHOT
+./tika-grpc/docker-build/docker-build.sh
+```
+
+### Optional Environment Variables
+
+- `TIKA_VERSION`: Maven project version (required)
+- `RELEASE_IMAGE_TAG`: Override the default tag (defaults to TIKA_VERSION
without -SNAPSHOT)
+- `DOCKER_ID`: Docker Hub username to push to Docker Hub
+- `AWS_ACCOUNT_ID`: AWS account ID to push to ECR
+- `AWS_REGION`: AWS region for ECR (default: us-west-2)
+- `AZURE_REGISTRY_NAME`: Azure Container Registry name
+- `MULTI_ARCH`: Build for multiple architectures (default: false)
+- `PROJECT_NAME`: Docker image name (default: tika-grpc)
+
+### Examples
+
+**Maven build with Docker Hub (recommended):**
+```bash
+MULTI_ARCH=false DOCKER_ID=ndipiazza PROJECT_NAME=tika-grpc RELEASE_IMAGE_TAG=4.0.0-SNAPSHOT \
+ mvn clean package -Dskip.docker.build=false
+```
+
+**Maven build with multi-arch:**
+```bash
+MULTI_ARCH=true DOCKER_ID=ndipiazza PROJECT_NAME=tika-grpc \
+ mvn clean package -Dskip.docker.build=false
+```
+
+**Maven build with AWS ECR:**
+```bash
+AWS_ACCOUNT_ID=123456789012 AWS_REGION=us-east-1 \
+ mvn clean package -Dskip.docker.build=false
+```
Review Comment:
The README examples use 'mvn clean package' but 'clean' is unnecessary when
running just the package phase and could slow down the build. Either remove
'clean' from these examples or explain why it's needed. The examples on lines
92, 98, and 104 should be consistent with the approach shown on line 30.
##########
tika-grpc/docker-build/docker-build.sh:
##########
@@ -0,0 +1,113 @@
+#!/bin/bash
+# This script is intended to be run from Maven exec plugin during the package phase of maven build
+
+if [ -z "${TIKA_VERSION}" ]; then
+ echo "Environment variable TIKA_VERSION is required, and should match the maven project version of Tika"
+ exit 1
+fi
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
Review Comment:
The script does not check if Docker is installed before attempting to run
docker commands. If Docker is not available, the script will fail with a
potentially unclear error message. Consider adding a check at the beginning of
the script to verify Docker is installed and provide a clear error message if
it's not.
##########
tika-grpc/docker-build/README.md:
##########
@@ -0,0 +1,170 @@
+# Tika gRPC Docker Build
+
+This directory contains the Docker build configuration for Apache Tika gRPC
server.
+
+## Overview
+
+The Docker image includes:
+- Tika gRPC server JAR
+- All Tika Pipes plugins (fetchers, emitters, iterators)
+- Parser packages (standard, extended, ML)
+- OCR support (Tesseract with multiple languages)
+- GDAL for geospatial formats
+- Common fonts
+
+## Building the Docker Image
+
+### Prerequisites
+
+1. Build Tika from the project root (this builds all modules including
plugins):
+```bash
+mvn clean install -DskipTests
+```
+
+### Option 1: Run Docker Build During Maven Package
+
+The Docker build can be triggered automatically during the Maven package phase:
+
+```bash
+cd tika-grpc
+mvn package -Dskip.docker.build=false
+```
+
+Or from the project root:
+```bash
+mvn clean install -DskipTests -Dskip.docker.build=false
+```
+
+**Note:** By default, `skip.docker.build=true` to avoid running Docker builds
during normal development.
+
+#### Controlling Docker Build with Environment Variables
+
+All docker-build.sh environment variables are passed through from your shell:
+
+```bash
+# Build and push to Docker Hub
+MULTI_ARCH=false DOCKER_ID=ndipiazza PROJECT_NAME=tika-grpc RELEASE_IMAGE_TAG=4.0.0-SNAPSHOT \
+ mvn clean package -Dskip.docker.build=false
+```
+
+```bash
+# Build multi-arch and push to Docker Hub
+MULTI_ARCH=true DOCKER_ID=myusername PROJECT_NAME=tika-grpc \
+ mvn clean package -Dskip.docker.build=false
+```
+
+```bash
+# Build and push to AWS ECR
+AWS_ACCOUNT_ID=123456789012 AWS_REGION=us-east-1 \
+ mvn clean package -Dskip.docker.build=false
+```
+
+```bash
+# Build and push to Azure Container Registry
+AZURE_REGISTRY_NAME=myregistry \
+ mvn clean package -Dskip.docker.build=false
+```
Review Comment:
The README states that '-Dskip.docker.build=false' is required in multiple
examples, but this contradicts the Maven profiles configuration which
automatically sets skip.docker.build=false when environment variables like
DOCKER_ID, AWS_ACCOUNT_ID, or AZURE_REGISTRY_NAME are present. The
documentation should clarify that -Dskip.docker.build=false is only needed when
NOT using the environment variable activation, or update the examples to be
consistent with the profile-based activation.
##########
tika-grpc/pom.xml:
##########
@@ -387,6 +424,52 @@
<mainClass>org.apache.tika.pipes.grpc.TikaGrpcServer</mainClass>
</configuration>
</plugin>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>exec-maven-plugin</artifactId>
+ <version>${maven.exec.version}</version>
+ <executions>
+ <execution>
+ <id>set-chmod-on-docker-build-sh</id>
+ <phase>validate</phase>
+ <goals>
+ <goal>exec</goal>
+ </goals>
+ <configuration>
+ <executable>chmod</executable>
+ <arguments>
+ <argument>+x</argument>
+ <argument>${project.basedir}/docker-build/docker-build.sh</argument>
+ </arguments>
Review Comment:
The chmod execution in the validate phase may fail on Windows systems where
the chmod command is not available. Consider either using a Maven plugin to set
file permissions (like maven-antrun-plugin) or adding a check to skip this
execution on non-Unix systems. Alternatively, the script could be checked into
git with executable permissions already set.
```suggestion
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-antrun-plugin</artifactId>
<version>3.1.0</version>
<executions>
<!-- Use maven-antrun-plugin to set executable permission in a
cross-platform way -->
<execution>
<id>set-chmod-on-docker-build-sh</id>
<phase>validate</phase>
<goals>
<goal>run</goal>
</goals>
<configuration>
<target>
<chmod file="${project.basedir}/docker-build/docker-build.sh" perm="755" failonerror="false"/>
</target>
```
##########
tika-grpc/docker-build/Dockerfile:
##########
@@ -0,0 +1,39 @@
+FROM ubuntu:latest
Review Comment:
Using 'ubuntu:latest' as the base image is not recommended for production
Docker images as it's not pinned to a specific version. This can lead to
unexpected behavior when the 'latest' tag points to a new Ubuntu release. Use a
specific version tag like 'ubuntu:22.04' or 'ubuntu:24.04' to ensure
reproducible builds.
```suggestion
FROM ubuntu:22.04
```
##########
tika-grpc/docker-build/Dockerfile:
##########
@@ -0,0 +1,39 @@
+FROM ubuntu:latest
+COPY libs/ /tika/libs/
+COPY plugins/ /tika/plugins/
+COPY config/ /tika/config/
+COPY bin/ /tika/bin
+ARG JRE='openjdk-17-jre-headless'
+ARG VERSION='4.0.0-SNAPSHOT'
Review Comment:
The ARG VERSION is defined but the value '4.0.0-SNAPSHOT' is hardcoded in
the Dockerfile. This should either be passed as a build argument or the default
should be removed since TIKA_VERSION is already being set from the Maven build.
The hardcoded version could become stale and inconsistent with the actual JAR
version being copied.
```suggestion
ARG VERSION
```
##########
tika-grpc/docker-build/docker-build.sh:
##########
@@ -0,0 +1,113 @@
+#!/bin/bash
+# This script is intended to be run from Maven exec plugin during the package phase of maven build
+
+if [ -z "${TIKA_VERSION}" ]; then
+ echo "Environment variable TIKA_VERSION is required, and should match the maven project version of Tika"
+ exit 1
+fi
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+cd "${SCRIPT_DIR}/../../" || exit
+
+OUT_DIR=target/tika-docker
+
+MULTI_ARCH=${MULTI_ARCH:-false}
+AWS_REGION=${AWS_REGION:-us-west-2}
+AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID:-}
+AZURE_REGISTRY_NAME=${AZURE_REGISTRY_NAME:-}
+DOCKER_ID=${DOCKER_ID:-}
+PROJECT_NAME=${PROJECT_NAME:-tika-grpc}
+
+# If RELEASE_IMAGE_TAG not specified, use TIKA_VERSION
+if [[ -z "${RELEASE_IMAGE_TAG}" ]]; then
+ RELEASE_IMAGE_TAG="${TIKA_VERSION}"
+ ## Remove '-SNAPSHOT' from the version string
+ RELEASE_IMAGE_TAG="${RELEASE_IMAGE_TAG//-SNAPSHOT/}"
+fi
+
+mkdir -p "${OUT_DIR}/libs"
+mkdir -p "${OUT_DIR}/plugins"
+mkdir -p "${OUT_DIR}/config"
+mkdir -p "${OUT_DIR}/bin"
+cp -v -r "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs"
+
+# Copy all tika-pipes plugin zip files
+for dir in tika-pipes/tika-pipes-plugins/*/; do
+ plugin_name=$(basename "$dir")
+ zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip"
+ if [ -f "$zip_file" ]; then
+ cp -v -r "$zip_file" "${OUT_DIR}/plugins"
+ else
+ echo "Plugin file $zip_file does not exist, skipping."
+ fi
+done
+
+# Copy parser package jars as plugins
+parser_packages=(
+ "tika-parsers/tika-parsers-standard/tika-parsers-standard-package"
+ "tika-parsers/tika-parsers-extended/tika-parser-scientific-package"
+ "tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package"
+ "tika-parsers/tika-parsers-ml/tika-parser-nlp-package"
+)
+
+for parser_package in "${parser_packages[@]}"; do
+ package_name=$(basename "$parser_package")
+ jar_file="${parser_package}/target/${package_name}-${TIKA_VERSION}.jar"
+ if [ -f "$jar_file" ]; then
+ cp -v -r "$jar_file" "${OUT_DIR}/plugins"
+ else
+ echo "Parser package file $jar_file does not exist, skipping."
+ fi
+done
+
+cp -v -r "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin"
+
+cp -v "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile"
+
+cd "${OUT_DIR}" || exit
+
+echo "Running docker build from directory: $(pwd)"
+
+IMAGE_TAGS=()
+if [[ -n "${AWS_ACCOUNT_ID}" ]]; then
+ aws ecr get-login-password --region ${AWS_REGION} | docker login --username AWS --password-stdin ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com
+ IMAGE_TAGS+=("-t ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${AZURE_REGISTRY_NAME}" ]]; then
+ az acr login --name ${AZURE_REGISTRY_NAME}
+ IMAGE_TAGS+=("-t ${AZURE_REGISTRY_NAME}.azurecr.io/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${DOCKER_ID}" ]]; then
+ IMAGE_TAGS+=("-t ${DOCKER_ID}/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [ ${#IMAGE_TAGS[@]} -eq 0 ]; then
+ echo "No image tags specified, skipping Docker build step. To enable build, set AWS_ACCOUNT_ID, AZURE_REGISTRY_NAME, and/or DOCKER_ID environment variables."
+ exit 0
+fi
+
+tag="${IMAGE_TAGS[*]}"
+if [ "${MULTI_ARCH}" == "true" ]; then
+ echo "Building multi arch image"
+ docker buildx create --name tikabuilder
+ # see https://askubuntu.com/questions/1339558/cant-build-dockerfile-for-arm64-due-to-libc-bin-segmentation-fault/1398147#1398147
+ docker run --rm --privileged tonistiigi/binfmt --install amd64
+ docker run --rm --privileged tonistiigi/binfmt --install arm64
+ docker buildx build \
+ --builder=tikabuilder . \
+ ${tag} \
+ --platform linux/amd64,linux/arm64 \
+ --push
Review Comment:
The --push flag (line 103 of docker-build.sh) means the multi-arch build
always pushes to the configured registries. If any of those registries is not
properly authenticated, the entire build fails only after time has been spent
building both architectures. Consider documenting this behavior clearly, or
providing an option to build without pushing for testing purposes.
##########
tika-grpc/pom.xml:
##########
@@ -387,6 +424,52 @@
<mainClass>org.apache.tika.pipes.grpc.TikaGrpcServer</mainClass>
</configuration>
</plugin>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>exec-maven-plugin</artifactId>
+ <version>${maven.exec.version}</version>
+ <executions>
+ <execution>
+ <id>set-chmod-on-docker-build-sh</id>
+ <phase>validate</phase>
+ <goals>
+ <goal>exec</goal>
+ </goals>
+ <configuration>
+ <executable>chmod</executable>
+ <arguments>
+ <argument>+x</argument>
+ <argument>${project.basedir}/docker-build/docker-build.sh</argument>
+ </arguments>
+ <skip>${skip.docker.build}</skip>
+ </configuration>
+ </execution>
+ <execution>
+ <id>prepare-docker-image</id>
+ <phase>package</phase>
+ <goals>
+ <goal>exec</goal>
+ </goals>
+ <configuration>
+ <executable>bash</executable>
+ <arguments>
+ <argument>${project.basedir}/docker-build/docker-build.sh</argument>
+ </arguments>
+ <environmentVariables>
+ <TIKA_VERSION>${project.version}</TIKA_VERSION>
+ <MULTI_ARCH>${env.MULTI_ARCH}</MULTI_ARCH>
+ <AWS_REGION>${env.AWS_REGION}</AWS_REGION>
+ <AWS_ACCOUNT_ID>${env.AWS_ACCOUNT_ID}</AWS_ACCOUNT_ID>
+ <AZURE_REGISTRY_NAME>${env.AZURE_REGISTRY_NAME}</AZURE_REGISTRY_NAME>
+ <DOCKER_ID>${env.DOCKER_ID}</DOCKER_ID>
+ <PROJECT_NAME>${env.PROJECT_NAME}</PROJECT_NAME>
+ <RELEASE_IMAGE_TAG>${env.RELEASE_IMAGE_TAG}</RELEASE_IMAGE_TAG>
+ </environmentVariables>
+ <skip>${skip.docker.build}</skip>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
Review Comment:
There are two separate exec-maven-plugin declarations in this file. The
first one (lines 412-426) and this one (lines 427-472) should be merged into a
single plugin configuration with all executions combined. Having duplicate
plugin declarations can cause unexpected behavior and makes the configuration
harder to maintain.
##########
tika-grpc/docker-build/docker-build.sh:
##########
@@ -0,0 +1,113 @@
+#!/bin/bash
+# This script is intended to be run from Maven exec plugin during the package phase of maven build
+
+if [ -z "${TIKA_VERSION}" ]; then
+ echo "Environment variable TIKA_VERSION is required, and should match the maven project version of Tika"
+ exit 1
+fi
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+cd "${SCRIPT_DIR}/../../" || exit
+
+OUT_DIR=target/tika-docker
+
+MULTI_ARCH=${MULTI_ARCH:-false}
+AWS_REGION=${AWS_REGION:-us-west-2}
+AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID:-}
+AZURE_REGISTRY_NAME=${AZURE_REGISTRY_NAME:-}
+DOCKER_ID=${DOCKER_ID:-}
+PROJECT_NAME=${PROJECT_NAME:-tika-grpc}
+
+# If RELEASE_IMAGE_TAG not specified, use TIKA_VERSION
+if [[ -z "${RELEASE_IMAGE_TAG}" ]]; then
+ RELEASE_IMAGE_TAG="${TIKA_VERSION}"
+ ## Remove '-SNAPSHOT' from the version string
+ RELEASE_IMAGE_TAG="${RELEASE_IMAGE_TAG//-SNAPSHOT/}"
+fi
+
+mkdir -p "${OUT_DIR}/libs"
+mkdir -p "${OUT_DIR}/plugins"
+mkdir -p "${OUT_DIR}/config"
+mkdir -p "${OUT_DIR}/bin"
+cp -v -r "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs"
+
+# Copy all tika-pipes plugin zip files
+for dir in tika-pipes/tika-pipes-plugins/*/; do
+ plugin_name=$(basename "$dir")
+ zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip"
+ if [ -f "$zip_file" ]; then
+ cp -v -r "$zip_file" "${OUT_DIR}/plugins"
+ else
+ echo "Plugin file $zip_file does not exist, skipping."
+ fi
+done
+
+# Copy parser package jars as plugins
+parser_packages=(
+ "tika-parsers/tika-parsers-standard/tika-parsers-standard-package"
+ "tika-parsers/tika-parsers-extended/tika-parser-scientific-package"
+ "tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package"
+ "tika-parsers/tika-parsers-ml/tika-parser-nlp-package"
+)
+
+for parser_package in "${parser_packages[@]}"; do
+ package_name=$(basename "$parser_package")
+ jar_file="${parser_package}/target/${package_name}-${TIKA_VERSION}.jar"
+ if [ -f "$jar_file" ]; then
+ cp -v -r "$jar_file" "${OUT_DIR}/plugins"
+ else
+ echo "Parser package file $jar_file does not exist, skipping."
+ fi
+done
+
+cp -v -r "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin"
+
+cp -v "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile"
+
+cd "${OUT_DIR}" || exit
+
+echo "Running docker build from directory: $(pwd)"
+
+IMAGE_TAGS=()
+if [[ -n "${AWS_ACCOUNT_ID}" ]]; then
+ aws ecr get-login-password --region ${AWS_REGION} | docker login --username AWS --password-stdin ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com
+ IMAGE_TAGS+=("-t ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${AZURE_REGISTRY_NAME}" ]]; then
+ az acr login --name ${AZURE_REGISTRY_NAME}
+ IMAGE_TAGS+=("-t ${AZURE_REGISTRY_NAME}.azurecr.io/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${DOCKER_ID}" ]]; then
+ IMAGE_TAGS+=("-t ${DOCKER_ID}/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [ ${#IMAGE_TAGS[@]} -eq 0 ]; then
+ echo "No image tags specified, skipping Docker build step. To enable build, set AWS_ACCOUNT_ID, AZURE_REGISTRY_NAME, and/or DOCKER_ID environment variables."
+ exit 0
+fi
+
+tag="${IMAGE_TAGS[*]}"
+if [ "${MULTI_ARCH}" == "true" ]; then
+ echo "Building multi arch image"
+ docker buildx create --name tikabuilder
+ # see https://askubuntu.com/questions/1339558/cant-build-dockerfile-for-arm64-due-to-libc-bin-segmentation-fault/1398147#1398147
+ docker run --rm --privileged tonistiigi/binfmt --install amd64
+ docker run --rm --privileged tonistiigi/binfmt --install arm64
Review Comment:
The multi-arch build path runs the third-party image `tonistiigi/binfmt`
with the `--privileged` flag and without pinning it to a specific version or
digest. If this Docker Hub image is ever compromised or replaced, anyone
running multi-arch builds could execute attacker-controlled code on the build
host with elevated privileges and expose registry/cloud credentials used during
the build. To reduce this supply-chain risk, pin `tonistiigi/binfmt` to a
trusted immutable digest (or vetted version) and avoid `--privileged` if
possible (e.g., use a narrower capability set or an alternative approach for
enabling binfmt).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]