Copilot commented on code in PR #2462:
URL: https://github.com/apache/tika/pull/2462#discussion_r2624780551


##########
tika-grpc/docker-build/docker-build.sh:
##########
@@ -0,0 +1,113 @@
+#!/bin/bash
+# This script is intended to be run from Maven exec plugin during the package 
phase of maven build
+
+if [ -z "${TIKA_VERSION}" ]; then
+    echo "Environment variable TIKA_VERSION is required, and should match the 
maven project version of Tika"
+    exit 1
+fi
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+cd "${SCRIPT_DIR}/../../" || exit
+
+OUT_DIR=target/tika-docker
+
+MULTI_ARCH=${MULTI_ARCH:-false}
+AWS_REGION=${AWS_REGION:-us-west-2}
+AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID:-}
+AZURE_REGISTRY_NAME=${AZURE_REGISTRY_NAME:-}
+DOCKER_ID=${DOCKER_ID:-}
+PROJECT_NAME=${PROJECT_NAME:-tika-grpc}
+
+# If RELEASE_IMAGE_TAG not specified, use TIKA_VERSION
+if [[ -z "${RELEASE_IMAGE_TAG}" ]]; then
+    RELEASE_IMAGE_TAG="${TIKA_VERSION}"
+    ## Remove '-SNAPSHOT' from the version string
+    RELEASE_IMAGE_TAG="${RELEASE_IMAGE_TAG//-SNAPSHOT/}"
+fi
+
+mkdir -p "${OUT_DIR}/libs"
+mkdir -p "${OUT_DIR}/plugins"
+mkdir -p "${OUT_DIR}/config"
+mkdir -p "${OUT_DIR}/bin"
+cp -v -r "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs"
+
+# Copy all tika-pipes plugin zip files
+for dir in tika-pipes/tika-pipes-plugins/*/; do
+    plugin_name=$(basename "$dir")
+    zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip"
+    if [ -f "$zip_file" ]; then
+        cp -v -r "$zip_file" "${OUT_DIR}/plugins"
+    else
+        echo "Plugin file $zip_file does not exist, skipping."
+    fi
+done
+
+# Copy parser package jars as plugins
+parser_packages=(
+    "tika-parsers/tika-parsers-standard/tika-parsers-standard-package"
+    "tika-parsers/tika-parsers-extended/tika-parser-scientific-package"
+    "tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package"
+    "tika-parsers/tika-parsers-ml/tika-parser-nlp-package"
+)
+
+for parser_package in "${parser_packages[@]}"; do
+    package_name=$(basename "$parser_package")
+    jar_file="${parser_package}/target/${package_name}-${TIKA_VERSION}.jar"
+    if [ -f "$jar_file" ]; then
+        cp -v -r "$jar_file" "${OUT_DIR}/plugins"
+    else
+        echo "Parser package file $jar_file does not exist, skipping."
+    fi
+done
+
+cp -v -r "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin"
+
+cp -v "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile"
+
+cd "${OUT_DIR}" || exit
+
+echo "Running docker build from directory: $(pwd)"
+
+IMAGE_TAGS=()
+if [[ -n "${AWS_ACCOUNT_ID}" ]]; then
+    aws ecr get-login-password --region ${AWS_REGION} | docker login 
--username AWS --password-stdin 
${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com
+    IMAGE_TAGS+=("-t 
${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${AZURE_REGISTRY_NAME}" ]]; then
+    az acr login --name ${AZURE_REGISTRY_NAME}
+    IMAGE_TAGS+=("-t 
${AZURE_REGISTRY_NAME}.azurecr.io/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${DOCKER_ID}" ]]; then
+    IMAGE_TAGS+=("-t ${DOCKER_ID}/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [ ${#IMAGE_TAGS[@]} -eq 0 ]; then
+    echo "No image tags specified, skipping Docker build step. To enable 
build, set AWS_ACCOUNT_ID, AZURE_REGISTRY_NAME, and/or DOCKER_ID environment 
variables."
+    exit 0
+fi
+
+tag="${IMAGE_TAGS[*]}"
+if [ "${MULTI_ARCH}" == "true" ]; then
+  echo "Building multi arch image"
+  docker buildx create --name tikabuilder
+  # see 
https://askubuntu.com/questions/1339558/cant-build-dockerfile-for-arm64-due-to-libc-bin-segmentation-fault/1398147#1398147
+  docker run --rm --privileged tonistiigi/binfmt --install amd64
+  docker run --rm --privileged tonistiigi/binfmt --install arm64
+  docker buildx build \
+      --builder=tikabuilder . \
+      ${tag} \
+      --platform linux/amd64,linux/arm64 \
+      --push
+  docker buildx stop tikabuilder

Review Comment:
   The buildx builder is stopped but not removed after use. This leaves the 
builder instance around, so the 'docker buildx create --name tikabuilder' call 
on the next run will report an error because a builder with that name already 
exists. Add 'docker buildx rm tikabuilder' after the stop command to clean up 
properly.
   ```suggestion
     docker buildx stop tikabuilder
     docker buildx rm tikabuilder
   ```



##########
tika-grpc/docker-build/start-tika-grpc.sh:
##########
@@ -0,0 +1,29 @@
+#!/bin/sh

Review Comment:
   The shebang is '/bin/sh', while docker-build.sh uses '#!/bin/bash'. This 
script appears to be POSIX-compliant, so it runs correctly as written; for 
consistency, consider either using the same shebang here or ensuring the script 
remains strictly POSIX-compliant.
   ```suggestion
   #!/bin/bash
   ```



##########
tika-grpc/pom.xml:
##########
@@ -38,7 +38,44 @@
     <asarkar-grpc-test.version>2.0.0</asarkar-grpc-test.version>
     <awaitility.version>4.3.0</awaitility.version>
     <j2objc-annotations.version>3.1</j2objc-annotations.version>
+    <skip.docker.build>true</skip.docker.build>
   </properties>
+  
+  <profiles>
+    <profile>
+      <id>enable-docker-build-aws</id>
+      <activation>
+        <property>
+          <name>env.AWS_ACCOUNT_ID</name>
+        </property>
+      </activation>
+      <properties>
+        <skip.docker.build>false</skip.docker.build>
+      </properties>
+    </profile>
+    <profile>
+      <id>enable-docker-build-azure</id>
+      <activation>
+        <property>
+          <name>env.AZURE_REGISTRY_NAME</name>
+        </property>
+      </activation>
+      <properties>
+        <skip.docker.build>false</skip.docker.build>
+      </properties>
+    </profile>
+    <profile>
+      <id>enable-docker-build-dockerhub</id>
+      <activation>
+        <property>
+          <name>env.DOCKER_ID</name>
+        </property>
+      </activation>
+      <properties>
+        <skip.docker.build>false</skip.docker.build>
+      </properties>
+    </profile>

Review Comment:
   The Maven profiles are activated based on environment variable presence, but 
the documentation suggests passing -Dskip.docker.build=false on the command 
line. This creates two different activation mechanisms which could be 
confusing. If env.DOCKER_ID is set, the profile activates and sets 
skip.docker.build=false, but the documentation also shows using 
-Dskip.docker.build=false explicitly. Consider documenting that either the 
environment variables OR the property can be used, and clarify the precedence 
between them.



##########
tika-grpc/docker-build/docker-build.sh:
##########
@@ -0,0 +1,113 @@
+#!/bin/bash
+# This script is intended to be run from Maven exec plugin during the package 
phase of maven build
+
+if [ -z "${TIKA_VERSION}" ]; then
+    echo "Environment variable TIKA_VERSION is required, and should match the 
maven project version of Tika"
+    exit 1
+fi
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+cd "${SCRIPT_DIR}/../../" || exit
+
+OUT_DIR=target/tika-docker
+
+MULTI_ARCH=${MULTI_ARCH:-false}
+AWS_REGION=${AWS_REGION:-us-west-2}
+AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID:-}
+AZURE_REGISTRY_NAME=${AZURE_REGISTRY_NAME:-}
+DOCKER_ID=${DOCKER_ID:-}
+PROJECT_NAME=${PROJECT_NAME:-tika-grpc}
+
+# If RELEASE_IMAGE_TAG not specified, use TIKA_VERSION
+if [[ -z "${RELEASE_IMAGE_TAG}" ]]; then
+    RELEASE_IMAGE_TAG="${TIKA_VERSION}"
+    ## Remove '-SNAPSHOT' from the version string
+    RELEASE_IMAGE_TAG="${RELEASE_IMAGE_TAG//-SNAPSHOT/}"
+fi
+
+mkdir -p "${OUT_DIR}/libs"
+mkdir -p "${OUT_DIR}/plugins"
+mkdir -p "${OUT_DIR}/config"
+mkdir -p "${OUT_DIR}/bin"
+cp -v -r "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs"
+
+# Copy all tika-pipes plugin zip files
+for dir in tika-pipes/tika-pipes-plugins/*/; do
+    plugin_name=$(basename "$dir")
+    zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip"
+    if [ -f "$zip_file" ]; then
+        cp -v -r "$zip_file" "${OUT_DIR}/plugins"
+    else
+        echo "Plugin file $zip_file does not exist, skipping."
+    fi
+done
+
+# Copy parser package jars as plugins
+parser_packages=(
+    "tika-parsers/tika-parsers-standard/tika-parsers-standard-package"
+    "tika-parsers/tika-parsers-extended/tika-parser-scientific-package"
+    "tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package"
+    "tika-parsers/tika-parsers-ml/tika-parser-nlp-package"
+)
+
+for parser_package in "${parser_packages[@]}"; do
+    package_name=$(basename "$parser_package")
+    jar_file="${parser_package}/target/${package_name}-${TIKA_VERSION}.jar"
+    if [ -f "$jar_file" ]; then
+        cp -v -r "$jar_file" "${OUT_DIR}/plugins"
+    else
+        echo "Parser package file $jar_file does not exist, skipping."
+    fi
+done
+
+cp -v -r "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin"
+
+cp -v "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile"
+
+cd "${OUT_DIR}" || exit
+
+echo "Running docker build from directory: $(pwd)"
+
+IMAGE_TAGS=()
+if [[ -n "${AWS_ACCOUNT_ID}" ]]; then
+    aws ecr get-login-password --region ${AWS_REGION} | docker login 
--username AWS --password-stdin 
${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com
+    IMAGE_TAGS+=("-t 
${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${AZURE_REGISTRY_NAME}" ]]; then
+    az acr login --name ${AZURE_REGISTRY_NAME}
+    IMAGE_TAGS+=("-t 
${AZURE_REGISTRY_NAME}.azurecr.io/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")

Review Comment:
   The AWS CLI and Azure CLI login commands are missing error handling. If the 
login fails (e.g., credentials are not configured), the script will continue 
and the subsequent docker build/push step can fail with a potentially confusing 
error. Add error checking after each login command to exit if authentication 
fails.



##########
tika-grpc/docker-build/README.md:
##########
@@ -0,0 +1,170 @@
+# Tika gRPC Docker Build
+
+This directory contains the Docker build configuration for Apache Tika gRPC 
server.
+
+## Overview
+
+The Docker image includes:
+- Tika gRPC server JAR
+- All Tika Pipes plugins (fetchers, emitters, iterators)
+- Parser packages (standard, extended, ML)
+- OCR support (Tesseract with multiple languages)
+- GDAL for geospatial formats
+- Common fonts
+
+## Building the Docker Image
+
+### Prerequisites
+
+1. Build Tika from the project root (this builds all modules including 
plugins):
+```bash
+mvn clean install -DskipTests
+```
+
+### Option 1: Run Docker Build During Maven Package
+
+The Docker build can be triggered automatically during the Maven package phase:
+
+```bash
+cd tika-grpc
+mvn package -Dskip.docker.build=false
+```
+
+Or from the project root:
+```bash
+mvn clean install -DskipTests -Dskip.docker.build=false
+```
+
+**Note:** By default, `skip.docker.build=true` to avoid running Docker builds 
during normal development.
+
+#### Controlling Docker Build with Environment Variables
+
+All docker-build.sh environment variables are passed through from your shell:
+
+```bash
+# Build and push to Docker Hub
+MULTI_ARCH=false DOCKER_ID=ndipiazza PROJECT_NAME=tika-grpc 
RELEASE_IMAGE_TAG=4.0.0-SNAPSHOT \
+  mvn clean package -Dskip.docker.build=false
+```

Review Comment:
   The example shows running 'mvn package' but the script copies files from 
'tika-pipes/tika-pipes-plugins/*' and 'tika-parsers/*' which are sibling 
directories to tika-grpc. These files won't exist unless a full build from the 
root is performed. The example should clarify that users need to run 'mvn clean 
install -DskipTests' from the project root first, or update the example to show 
the command being run from the root with '-f tika-grpc' as shown in the PR 
description.



##########
tika-grpc/README.md:
##########
@@ -11,3 +11,59 @@ This server will manage a pool of Tika Pipes clients.
     * Delete
 * Fetch + Parse a given Fetch Item
 
+## Building
+
+### Standard Build
+
+```bash
+mvn clean install -DskipTests
+```
+
+### Building Docker Image
+
+The tika-grpc module includes Docker build support. See 
[docker-build/README.md](docker-build/README.md) for complete documentation.
+
+#### Quick Start - Docker Build
+
+**Option 1: Build with Maven (recommended)**
+
+```bash
+cd tika-grpc
+mvn package -Dskip.docker.build=false
+```
+
+Or from the project root:
+```bash
+mvn clean install -DskipTests -Dskip.docker.build=false

Review Comment:
   The example on line 37 shows running 'mvn clean install -DskipTests 
-Dskip.docker.build=false' from the project root, but this will build ALL Tika 
modules with Docker builds enabled. This contradicts the stated purpose of just 
building the tika-grpc module. The example should either add '-pl tika-grpc 
-am' to only build tika-grpc and its dependencies, or clarify that this builds 
the entire project.
   ```suggestion
   mvn clean install -DskipTests -Dskip.docker.build=false -pl tika-grpc -am
   ```



##########
tika-grpc/docker-build/docker-build.sh:
##########
@@ -0,0 +1,113 @@
+#!/bin/bash
+# This script is intended to be run from Maven exec plugin during the package 
phase of maven build
+
+if [ -z "${TIKA_VERSION}" ]; then
+    echo "Environment variable TIKA_VERSION is required, and should match the 
maven project version of Tika"
+    exit 1
+fi
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+cd "${SCRIPT_DIR}/../../" || exit
+
+OUT_DIR=target/tika-docker
+
+MULTI_ARCH=${MULTI_ARCH:-false}
+AWS_REGION=${AWS_REGION:-us-west-2}
+AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID:-}
+AZURE_REGISTRY_NAME=${AZURE_REGISTRY_NAME:-}
+DOCKER_ID=${DOCKER_ID:-}
+PROJECT_NAME=${PROJECT_NAME:-tika-grpc}
+
+# If RELEASE_IMAGE_TAG not specified, use TIKA_VERSION
+if [[ -z "${RELEASE_IMAGE_TAG}" ]]; then
+    RELEASE_IMAGE_TAG="${TIKA_VERSION}"
+    ## Remove '-SNAPSHOT' from the version string
+    RELEASE_IMAGE_TAG="${RELEASE_IMAGE_TAG//-SNAPSHOT/}"
+fi
+
+mkdir -p "${OUT_DIR}/libs"
+mkdir -p "${OUT_DIR}/plugins"
+mkdir -p "${OUT_DIR}/config"
+mkdir -p "${OUT_DIR}/bin"
+cp -v -r "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs"
+
+# Copy all tika-pipes plugin zip files
+for dir in tika-pipes/tika-pipes-plugins/*/; do
+    plugin_name=$(basename "$dir")
+    zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip"
+    if [ -f "$zip_file" ]; then
+        cp -v -r "$zip_file" "${OUT_DIR}/plugins"
+    else
+        echo "Plugin file $zip_file does not exist, skipping."
+    fi
+done
+
+# Copy parser package jars as plugins
+parser_packages=(
+    "tika-parsers/tika-parsers-standard/tika-parsers-standard-package"
+    "tika-parsers/tika-parsers-extended/tika-parser-scientific-package"
+    "tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package"
+    "tika-parsers/tika-parsers-ml/tika-parser-nlp-package"
+)
+
+for parser_package in "${parser_packages[@]}"; do
+    package_name=$(basename "$parser_package")
+    jar_file="${parser_package}/target/${package_name}-${TIKA_VERSION}.jar"
+    if [ -f "$jar_file" ]; then
+        cp -v -r "$jar_file" "${OUT_DIR}/plugins"
+    else
+        echo "Parser package file $jar_file does not exist, skipping."
+    fi
+done
+
+cp -v -r "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin"
+
+cp -v "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile"
+
+cd "${OUT_DIR}" || exit
+
+echo "Running docker build from directory: $(pwd)"
+
+IMAGE_TAGS=()
+if [[ -n "${AWS_ACCOUNT_ID}" ]]; then
+    aws ecr get-login-password --region ${AWS_REGION} | docker login 
--username AWS --password-stdin 
${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com
+    IMAGE_TAGS+=("-t 
${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${AZURE_REGISTRY_NAME}" ]]; then
+    az acr login --name ${AZURE_REGISTRY_NAME}
+    IMAGE_TAGS+=("-t 
${AZURE_REGISTRY_NAME}.azurecr.io/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${DOCKER_ID}" ]]; then
+    IMAGE_TAGS+=("-t ${DOCKER_ID}/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [ ${#IMAGE_TAGS[@]} -eq 0 ]; then
+    echo "No image tags specified, skipping Docker build step. To enable 
build, set AWS_ACCOUNT_ID, AZURE_REGISTRY_NAME, and/or DOCKER_ID environment 
variables."
+    exit 0
+fi
+
+tag="${IMAGE_TAGS[*]}"
+if [ "${MULTI_ARCH}" == "true" ]; then
+  echo "Building multi arch image"
+  docker buildx create --name tikabuilder
+  # see 
https://askubuntu.com/questions/1339558/cant-build-dockerfile-for-arm64-due-to-libc-bin-segmentation-fault/1398147#1398147
+  docker run --rm --privileged tonistiigi/binfmt --install amd64
+  docker run --rm --privileged tonistiigi/binfmt --install arm64
+  docker buildx build \
+      --builder=tikabuilder . \
+      ${tag} \
+      --platform linux/amd64,linux/arm64 \
+      --push
+  docker buildx stop tikabuilder
+else
+  echo "Building single arch image"
+  # build single arch
+  docker build . ${tag}

Review Comment:
   The variable expansion on line 92 using "${IMAGE_TAGS[*]}" will concatenate 
all tags into a single space-separated string, which works for the docker build 
command on line 108. However, this is fragile - if any tag contains spaces, it 
will break. While the current usage appears correct, consider using a loop to 
build the docker command with proper quoting for better robustness, or add 
validation to ensure tags don't contain spaces.



##########
tika-grpc/docker-build/docker-build.sh:
##########
@@ -0,0 +1,113 @@
+#!/bin/bash
+# This script is intended to be run from Maven exec plugin during the package 
phase of maven build
+
+if [ -z "${TIKA_VERSION}" ]; then
+    echo "Environment variable TIKA_VERSION is required, and should match the 
maven project version of Tika"
+    exit 1
+fi
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+cd "${SCRIPT_DIR}/../../" || exit
+
+OUT_DIR=target/tika-docker
+
+MULTI_ARCH=${MULTI_ARCH:-false}
+AWS_REGION=${AWS_REGION:-us-west-2}
+AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID:-}
+AZURE_REGISTRY_NAME=${AZURE_REGISTRY_NAME:-}
+DOCKER_ID=${DOCKER_ID:-}
+PROJECT_NAME=${PROJECT_NAME:-tika-grpc}
+
+# If RELEASE_IMAGE_TAG not specified, use TIKA_VERSION
+if [[ -z "${RELEASE_IMAGE_TAG}" ]]; then
+    RELEASE_IMAGE_TAG="${TIKA_VERSION}"
+    ## Remove '-SNAPSHOT' from the version string
+    RELEASE_IMAGE_TAG="${RELEASE_IMAGE_TAG//-SNAPSHOT/}"
+fi
+
+mkdir -p "${OUT_DIR}/libs"
+mkdir -p "${OUT_DIR}/plugins"
+mkdir -p "${OUT_DIR}/config"
+mkdir -p "${OUT_DIR}/bin"
+cp -v -r "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs"
+
+# Copy all tika-pipes plugin zip files
+for dir in tika-pipes/tika-pipes-plugins/*/; do
+    plugin_name=$(basename "$dir")
+    zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip"
+    if [ -f "$zip_file" ]; then
+        cp -v -r "$zip_file" "${OUT_DIR}/plugins"
+    else
+        echo "Plugin file $zip_file does not exist, skipping."
+    fi
+done
+
+# Copy parser package jars as plugins
+parser_packages=(
+    "tika-parsers/tika-parsers-standard/tika-parsers-standard-package"
+    "tika-parsers/tika-parsers-extended/tika-parser-scientific-package"
+    "tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package"
+    "tika-parsers/tika-parsers-ml/tika-parser-nlp-package"
+)
+
+for parser_package in "${parser_packages[@]}"; do
+    package_name=$(basename "$parser_package")
+    jar_file="${parser_package}/target/${package_name}-${TIKA_VERSION}.jar"
+    if [ -f "$jar_file" ]; then
+        cp -v -r "$jar_file" "${OUT_DIR}/plugins"
+    else
+        echo "Parser package file $jar_file does not exist, skipping."
+    fi
+done
+
+cp -v -r "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin"
+
+cp -v "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile"
+
+cd "${OUT_DIR}" || exit
+
+echo "Running docker build from directory: $(pwd)"
+
+IMAGE_TAGS=()
+if [[ -n "${AWS_ACCOUNT_ID}" ]]; then
+    aws ecr get-login-password --region ${AWS_REGION} | docker login 
--username AWS --password-stdin 
${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com
+    IMAGE_TAGS+=("-t 
${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${AZURE_REGISTRY_NAME}" ]]; then
+    az acr login --name ${AZURE_REGISTRY_NAME}
+    IMAGE_TAGS+=("-t 
${AZURE_REGISTRY_NAME}.azurecr.io/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${DOCKER_ID}" ]]; then
+    IMAGE_TAGS+=("-t ${DOCKER_ID}/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi

Review Comment:
   Variables used in shell commands should be quoted to prevent word splitting 
and glob expansion. The variables AWS_REGION, AWS_ACCOUNT_ID, 
AZURE_REGISTRY_NAME, DOCKER_ID, PROJECT_NAME, and RELEASE_IMAGE_TAG should be 
wrapped in double quotes throughout the script. For example, line 74 should use 
"${AWS_REGION}" instead of ${AWS_REGION}.



##########
tika-grpc/docker-build/docker-build.sh:
##########
@@ -0,0 +1,113 @@
+#!/bin/bash
+# This script is intended to be run from Maven exec plugin during the package 
phase of maven build
+
+if [ -z "${TIKA_VERSION}" ]; then
+    echo "Environment variable TIKA_VERSION is required, and should match the 
maven project version of Tika"
+    exit 1
+fi
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+cd "${SCRIPT_DIR}/../../" || exit
+
+OUT_DIR=target/tika-docker
+
+MULTI_ARCH=${MULTI_ARCH:-false}
+AWS_REGION=${AWS_REGION:-us-west-2}
+AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID:-}
+AZURE_REGISTRY_NAME=${AZURE_REGISTRY_NAME:-}
+DOCKER_ID=${DOCKER_ID:-}
+PROJECT_NAME=${PROJECT_NAME:-tika-grpc}
+
+# If RELEASE_IMAGE_TAG not specified, use TIKA_VERSION
+if [[ -z "${RELEASE_IMAGE_TAG}" ]]; then
+    RELEASE_IMAGE_TAG="${TIKA_VERSION}"
+    ## Remove '-SNAPSHOT' from the version string
+    RELEASE_IMAGE_TAG="${RELEASE_IMAGE_TAG//-SNAPSHOT/}"
+fi
+
+mkdir -p "${OUT_DIR}/libs"
+mkdir -p "${OUT_DIR}/plugins"
+mkdir -p "${OUT_DIR}/config"
+mkdir -p "${OUT_DIR}/bin"
+cp -v -r "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs"
+
+# Copy all tika-pipes plugin zip files
+for dir in tika-pipes/tika-pipes-plugins/*/; do
+    plugin_name=$(basename "$dir")
+    zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip"
+    if [ -f "$zip_file" ]; then
+        cp -v -r "$zip_file" "${OUT_DIR}/plugins"
+    else
+        echo "Plugin file $zip_file does not exist, skipping."
+    fi
+done
+

Review Comment:
   The script copies plugin files from 'tika-pipes/tika-pipes-plugins/*/' but 
if a plugin doesn't exist or hasn't been built, it silently skips it with just 
an echo message. This could lead to missing plugins in the Docker image without 
a clear indication that something went wrong. Consider making the build fail or 
at least provide a warning if expected plugins are missing, especially for 
critical plugins.
   ```suggestion
           echo "WARNING: Plugin file $zip_file does not exist, skipping."
       fi
   done
   
   # List of required (critical) plugins - add plugin directory names (not full path)
   REQUIRED_PLUGINS=(
       # Add critical plugin names here, e.g.:
       # "tika-pipes-example-plugin"
   )
   
   # Check for required plugin zip files
   for plugin_name in "${REQUIRED_PLUGINS[@]}"; do
       
       zip_file="tika-pipes/tika-pipes-plugins/${plugin_name}/target/${plugin_name}-${TIKA_VERSION}.zip"
       if [ ! -f "$zip_file" ]; then
           echo "ERROR: Required plugin file $zip_file is missing. Failing build."
           exit 1
       fi
   done
   ```



##########
tika-grpc/docker-build/README.md:
##########
@@ -0,0 +1,170 @@
+# Tika gRPC Docker Build
+
+This directory contains the Docker build configuration for Apache Tika gRPC 
server.
+
+## Overview
+
+The Docker image includes:
+- Tika gRPC server JAR
+- All Tika Pipes plugins (fetchers, emitters, iterators)
+- Parser packages (standard, extended, ML)
+- OCR support (Tesseract with multiple languages)
+- GDAL for geospatial formats
+- Common fonts
+
+## Building the Docker Image
+
+### Prerequisites
+
+1. Build Tika from the project root (this builds all modules including 
plugins):
+```bash
+mvn clean install -DskipTests
+```
+
+### Option 1: Run Docker Build During Maven Package
+
+The Docker build can be triggered automatically during the Maven package phase:
+
+```bash
+cd tika-grpc
+mvn package -Dskip.docker.build=false
+```
+
+Or from the project root:
+```bash
+mvn clean install -DskipTests -Dskip.docker.build=false
+```
+
+**Note:** By default, `skip.docker.build=true` to avoid running Docker builds 
during normal development.
+
+#### Controlling Docker Build with Environment Variables
+
+All docker-build.sh environment variables are passed through from your shell:
+
+```bash
+# Build and push to Docker Hub
+MULTI_ARCH=false DOCKER_ID=ndipiazza PROJECT_NAME=tika-grpc 
RELEASE_IMAGE_TAG=4.0.0-SNAPSHOT \
+  mvn clean package -Dskip.docker.build=false
+```
+
+```bash
+# Build multi-arch and push to Docker Hub
+MULTI_ARCH=true DOCKER_ID=myusername PROJECT_NAME=tika-grpc \
+  mvn clean package -Dskip.docker.build=false
+```
+
+```bash
+# Build and push to AWS ECR
+AWS_ACCOUNT_ID=123456789012 AWS_REGION=us-east-1 \
+  mvn clean package -Dskip.docker.build=false
+```
+
+```bash
+# Build and push to Azure Container Registry
+AZURE_REGISTRY_NAME=myregistry \
+  mvn clean package -Dskip.docker.build=false
+```
+
+### Option 2: Run the Docker Build Script Manually
+
+Set the required environment variable and run the script:
+
+```bash
+export TIKA_VERSION=4.0.0-SNAPSHOT
+./tika-grpc/docker-build/docker-build.sh
+```
+
+### Optional Environment Variables
+
+- `TIKA_VERSION`: Maven project version (required)
+- `RELEASE_IMAGE_TAG`: Override the default tag (defaults to TIKA_VERSION 
without -SNAPSHOT)
+- `DOCKER_ID`: Docker Hub username to push to Docker Hub
+- `AWS_ACCOUNT_ID`: AWS account ID to push to ECR
+- `AWS_REGION`: AWS region for ECR (default: us-west-2)
+- `AZURE_REGISTRY_NAME`: Azure Container Registry name
+- `MULTI_ARCH`: Build for multiple architectures (default: false)
+- `PROJECT_NAME`: Docker image name (default: tika-grpc)
+
+### Examples
+
+**Maven build with Docker Hub (recommended):**
+```bash
+MULTI_ARCH=false DOCKER_ID=ndipiazza PROJECT_NAME=tika-grpc 
RELEASE_IMAGE_TAG=4.0.0-SNAPSHOT \
+  mvn clean package -Dskip.docker.build=false
+```
+
+**Maven build with multi-arch:**
+```bash
+MULTI_ARCH=true DOCKER_ID=ndipiazza PROJECT_NAME=tika-grpc \
+  mvn clean package -Dskip.docker.build=false
+```
+
+**Maven build with AWS ECR:**
+```bash
+AWS_ACCOUNT_ID=123456789012 AWS_REGION=us-east-1 \
+  mvn clean package -Dskip.docker.build=false
+```

Review Comment:
   The README examples use 'mvn clean package' but 'clean' is unnecessary when 
running just the package phase and could slow down the build. Either remove 
'clean' from these examples or explain why it's needed. The examples on lines 
92, 98, and 104 should be consistent with the approach shown on line 30.



##########
tika-grpc/docker-build/docker-build.sh:
##########
@@ -0,0 +1,113 @@
+#!/bin/bash
+# This script is intended to be run from Maven exec plugin during the package 
phase of maven build
+
+if [ -z "${TIKA_VERSION}" ]; then
+    echo "Environment variable TIKA_VERSION is required, and should match the 
maven project version of Tika"
+    exit 1
+fi
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+

Review Comment:
   The script does not check if Docker is installed before attempting to run 
docker commands. If Docker is not available, the script will fail with a 
potentially unclear error message. Consider adding a check at the beginning of 
the script to verify Docker is installed and provide a clear error message if 
it's not.



##########
tika-grpc/docker-build/README.md:
##########
@@ -0,0 +1,170 @@
+# Tika gRPC Docker Build
+
+This directory contains the Docker build configuration for Apache Tika gRPC 
server.
+
+## Overview
+
+The Docker image includes:
+- Tika gRPC server JAR
+- All Tika Pipes plugins (fetchers, emitters, iterators)
+- Parser packages (standard, extended, ML)
+- OCR support (Tesseract with multiple languages)
+- GDAL for geospatial formats
+- Common fonts
+
+## Building the Docker Image
+
+### Prerequisites
+
+1. Build Tika from the project root (this builds all modules including 
plugins):
+```bash
+mvn clean install -DskipTests
+```
+
+### Option 1: Run Docker Build During Maven Package
+
+The Docker build can be triggered automatically during the Maven package phase:
+
+```bash
+cd tika-grpc
+mvn package -Dskip.docker.build=false
+```
+
+Or from the project root:
+```bash
+mvn clean install -DskipTests -Dskip.docker.build=false
+```
+
+**Note:** By default, `skip.docker.build=true` to avoid running Docker builds 
during normal development.
+
+#### Controlling Docker Build with Environment Variables
+
+All docker-build.sh environment variables are passed through from your shell:
+
+```bash
+# Build and push to Docker Hub
+MULTI_ARCH=false DOCKER_ID=ndipiazza PROJECT_NAME=tika-grpc 
RELEASE_IMAGE_TAG=4.0.0-SNAPSHOT \
+  mvn clean package -Dskip.docker.build=false
+```
+
+```bash
+# Build multi-arch and push to Docker Hub
+MULTI_ARCH=true DOCKER_ID=myusername PROJECT_NAME=tika-grpc \
+  mvn clean package -Dskip.docker.build=false
+```
+
+```bash
+# Build and push to AWS ECR
+AWS_ACCOUNT_ID=123456789012 AWS_REGION=us-east-1 \
+  mvn clean package -Dskip.docker.build=false
+```
+
+```bash
+# Build and push to Azure Container Registry
+AZURE_REGISTRY_NAME=myregistry \
+  mvn clean package -Dskip.docker.build=false
+```

Review Comment:
   The README states that '-Dskip.docker.build=false' is required in multiple 
examples, but this contradicts the Maven profiles configuration which 
automatically sets skip.docker.build=false when environment variables like 
DOCKER_ID, AWS_ACCOUNT_ID, or AZURE_REGISTRY_NAME are present. The 
documentation should clarify that -Dskip.docker.build=false is only needed when 
NOT using the environment variable activation, or update the examples to be 
consistent with the profile-based activation.



##########
tika-grpc/pom.xml:
##########
@@ -387,6 +424,52 @@
           <mainClass>org.apache.tika.pipes.grpc.TikaGrpcServer</mainClass>
         </configuration>
       </plugin>
+      <plugin>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>exec-maven-plugin</artifactId>
+        <version>${maven.exec.version}</version>
+        <executions>
+          <execution>
+            <id>set-chmod-on-docker-build-sh</id>
+            <phase>validate</phase>
+            <goals>
+              <goal>exec</goal>
+            </goals>
+            <configuration>
+              <executable>chmod</executable>
+              <arguments>
+                <argument>+x</argument>
+                
<argument>${project.basedir}/docker-build/docker-build.sh</argument>
+              </arguments>

Review Comment:
   The chmod execution in the validate phase may fail on Windows systems where 
the chmod command is not available. Consider either using a Maven plugin to set 
file permissions (like maven-antrun-plugin) or adding a check to skip this 
execution on non-Unix systems. Alternatively, the script could be checked into 
git with executable permissions already set.
   ```suggestion
           <groupId>org.apache.maven.plugins</groupId>
           <artifactId>maven-antrun-plugin</artifactId>
           <version>3.1.0</version>
           <executions>
             <!-- Use maven-antrun-plugin to set executable permission in a 
cross-platform way -->
             <execution>
               <id>set-chmod-on-docker-build-sh</id>
               <phase>validate</phase>
               <goals>
                 <goal>run</goal>
               </goals>
               <configuration>
                 <target>
                   <chmod 
file="${project.basedir}/docker-build/docker-build.sh" perm="755" 
failonerror="false"/>
                 </target>
   ```



##########
tika-grpc/docker-build/Dockerfile:
##########
@@ -0,0 +1,39 @@
+FROM ubuntu:latest

Review Comment:
   Using 'ubuntu:latest' as the base image is not recommended for production 
Docker images as it's not pinned to a specific version. This can lead to 
unexpected behavior when the 'latest' tag points to a new Ubuntu release. Use a 
specific version tag like 'ubuntu:22.04' or 'ubuntu:24.04' to ensure 
reproducible builds.
   ```suggestion
   FROM ubuntu:22.04
   ```



##########
tika-grpc/docker-build/Dockerfile:
##########
@@ -0,0 +1,39 @@
+FROM ubuntu:latest
+COPY libs/ /tika/libs/
+COPY plugins/ /tika/plugins/
+COPY config/ /tika/config/
+COPY bin/ /tika/bin
+ARG JRE='openjdk-17-jre-headless'
+ARG VERSION='4.0.0-SNAPSHOT'

Review Comment:
   The ARG VERSION is defined but the value '4.0.0-SNAPSHOT' is hardcoded in 
the Dockerfile. This should either be passed as a build argument or the default 
should be removed since TIKA_VERSION is already being set from the Maven build. 
The hardcoded version could become stale and inconsistent with the actual JAR 
version being copied.
   ```suggestion
   ARG VERSION
   ```



##########
tika-grpc/docker-build/docker-build.sh:
##########
@@ -0,0 +1,113 @@
+#!/bin/bash
+# This script is intended to be run from Maven exec plugin during the package phase of maven build
+
+if [ -z "${TIKA_VERSION}" ]; then
+    echo "Environment variable TIKA_VERSION is required, and should match the maven project version of Tika"
+    exit 1
+fi
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+cd "${SCRIPT_DIR}/../../" || exit
+
+OUT_DIR=target/tika-docker
+
+MULTI_ARCH=${MULTI_ARCH:-false}
+AWS_REGION=${AWS_REGION:-us-west-2}
+AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID:-}
+AZURE_REGISTRY_NAME=${AZURE_REGISTRY_NAME:-}
+DOCKER_ID=${DOCKER_ID:-}
+PROJECT_NAME=${PROJECT_NAME:-tika-grpc}
+
+# If RELEASE_IMAGE_TAG not specified, use TIKA_VERSION
+if [[ -z "${RELEASE_IMAGE_TAG}" ]]; then
+    RELEASE_IMAGE_TAG="${TIKA_VERSION}"
+    ## Remove '-SNAPSHOT' from the version string
+    RELEASE_IMAGE_TAG="${RELEASE_IMAGE_TAG//-SNAPSHOT/}"
+fi
+
+mkdir -p "${OUT_DIR}/libs"
+mkdir -p "${OUT_DIR}/plugins"
+mkdir -p "${OUT_DIR}/config"
+mkdir -p "${OUT_DIR}/bin"
+cp -v -r "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs"
+
+# Copy all tika-pipes plugin zip files
+for dir in tika-pipes/tika-pipes-plugins/*/; do
+    plugin_name=$(basename "$dir")
+    zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip"
+    if [ -f "$zip_file" ]; then
+        cp -v -r "$zip_file" "${OUT_DIR}/plugins"
+    else
+        echo "Plugin file $zip_file does not exist, skipping."
+    fi
+done
+
+# Copy parser package jars as plugins
+parser_packages=(
+    "tika-parsers/tika-parsers-standard/tika-parsers-standard-package"
+    "tika-parsers/tika-parsers-extended/tika-parser-scientific-package"
+    "tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package"
+    "tika-parsers/tika-parsers-ml/tika-parser-nlp-package"
+)
+
+for parser_package in "${parser_packages[@]}"; do
+    package_name=$(basename "$parser_package")
+    jar_file="${parser_package}/target/${package_name}-${TIKA_VERSION}.jar"
+    if [ -f "$jar_file" ]; then
+        cp -v -r "$jar_file" "${OUT_DIR}/plugins"
+    else
+        echo "Parser package file $jar_file does not exist, skipping."
+    fi
+done
+
+cp -v -r "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin"
+
+cp -v "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile"
+
+cd "${OUT_DIR}" || exit
+
+echo "Running docker build from directory: $(pwd)"
+
+IMAGE_TAGS=()
+if [[ -n "${AWS_ACCOUNT_ID}" ]]; then
+    aws ecr get-login-password --region ${AWS_REGION} | docker login --username AWS --password-stdin ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com
+    IMAGE_TAGS+=("-t ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${AZURE_REGISTRY_NAME}" ]]; then
+    az acr login --name ${AZURE_REGISTRY_NAME}
+    IMAGE_TAGS+=("-t ${AZURE_REGISTRY_NAME}.azurecr.io/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${DOCKER_ID}" ]]; then
+    IMAGE_TAGS+=("-t ${DOCKER_ID}/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [ ${#IMAGE_TAGS[@]} -eq 0 ]; then
+    echo "No image tags specified, skipping Docker build step. To enable build, set AWS_ACCOUNT_ID, AZURE_REGISTRY_NAME, and/or DOCKER_ID environment variables."
+    exit 0
+fi
+
+tag="${IMAGE_TAGS[*]}"
+if [ "${MULTI_ARCH}" == "true" ]; then
+  echo "Building multi arch image"
+  docker buildx create --name tikabuilder
+  # see https://askubuntu.com/questions/1339558/cant-build-dockerfile-for-arm64-due-to-libc-bin-segmentation-fault/1398147#1398147
+  docker run --rm --privileged tonistiigi/binfmt --install amd64
+  docker run --rm --privileged tonistiigi/binfmt --install arm64
+  docker buildx build \
+      --builder=tikabuilder . \
+      ${tag} \
+      --platform linux/amd64,linux/arm64 \
+      --push

Review Comment:
   The --push flag on line 103 means the multi-arch build will automatically 
push to registries. However, if any of the configured registries are not 
properly authenticated, the entire build will fail after spending time building 
both architectures. Consider documenting this behavior clearly, or providing an 
option to build without pushing for testing purposes.



##########
tika-grpc/pom.xml:
##########
@@ -387,6 +424,52 @@
           <mainClass>org.apache.tika.pipes.grpc.TikaGrpcServer</mainClass>
         </configuration>
       </plugin>
+      <plugin>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>exec-maven-plugin</artifactId>
+        <version>${maven.exec.version}</version>
+        <executions>
+          <execution>
+            <id>set-chmod-on-docker-build-sh</id>
+            <phase>validate</phase>
+            <goals>
+              <goal>exec</goal>
+            </goals>
+            <configuration>
+              <executable>chmod</executable>
+              <arguments>
+                <argument>+x</argument>
+                <argument>${project.basedir}/docker-build/docker-build.sh</argument>
+              </arguments>
+              <skip>${skip.docker.build}</skip>
+            </configuration>
+          </execution>
+          <execution>
+            <id>prepare-docker-image</id>
+            <phase>package</phase>
+            <goals>
+              <goal>exec</goal>
+            </goals>
+            <configuration>
+              <executable>bash</executable>
+              <arguments>
+                <argument>${project.basedir}/docker-build/docker-build.sh</argument>
+              </arguments>
+              <environmentVariables>
+                <TIKA_VERSION>${project.version}</TIKA_VERSION>
+                <MULTI_ARCH>${env.MULTI_ARCH}</MULTI_ARCH>
+                <AWS_REGION>${env.AWS_REGION}</AWS_REGION>
+                <AWS_ACCOUNT_ID>${env.AWS_ACCOUNT_ID}</AWS_ACCOUNT_ID>
+                <AZURE_REGISTRY_NAME>${env.AZURE_REGISTRY_NAME}</AZURE_REGISTRY_NAME>
+                <DOCKER_ID>${env.DOCKER_ID}</DOCKER_ID>
+                <PROJECT_NAME>${env.PROJECT_NAME}</PROJECT_NAME>
+                <RELEASE_IMAGE_TAG>${env.RELEASE_IMAGE_TAG}</RELEASE_IMAGE_TAG>
+              </environmentVariables>
+              <skip>${skip.docker.build}</skip>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>

Review Comment:
   There are two separate exec-maven-plugin declarations in this file. The 
first one (lines 412-426) and this one (lines 427-472) should be merged into a 
single plugin configuration with all executions combined. Having duplicate 
plugin declarations can cause unexpected behavior and makes the configuration 
harder to maintain.



##########
tika-grpc/docker-build/docker-build.sh:
##########
@@ -0,0 +1,113 @@
+#!/bin/bash
+# This script is intended to be run from Maven exec plugin during the package phase of maven build
+
+if [ -z "${TIKA_VERSION}" ]; then
+    echo "Environment variable TIKA_VERSION is required, and should match the maven project version of Tika"
+    exit 1
+fi
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+cd "${SCRIPT_DIR}/../../" || exit
+
+OUT_DIR=target/tika-docker
+
+MULTI_ARCH=${MULTI_ARCH:-false}
+AWS_REGION=${AWS_REGION:-us-west-2}
+AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID:-}
+AZURE_REGISTRY_NAME=${AZURE_REGISTRY_NAME:-}
+DOCKER_ID=${DOCKER_ID:-}
+PROJECT_NAME=${PROJECT_NAME:-tika-grpc}
+
+# If RELEASE_IMAGE_TAG not specified, use TIKA_VERSION
+if [[ -z "${RELEASE_IMAGE_TAG}" ]]; then
+    RELEASE_IMAGE_TAG="${TIKA_VERSION}"
+    ## Remove '-SNAPSHOT' from the version string
+    RELEASE_IMAGE_TAG="${RELEASE_IMAGE_TAG//-SNAPSHOT/}"
+fi
+
+mkdir -p "${OUT_DIR}/libs"
+mkdir -p "${OUT_DIR}/plugins"
+mkdir -p "${OUT_DIR}/config"
+mkdir -p "${OUT_DIR}/bin"
+cp -v -r "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs"
+
+# Copy all tika-pipes plugin zip files
+for dir in tika-pipes/tika-pipes-plugins/*/; do
+    plugin_name=$(basename "$dir")
+    zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip"
+    if [ -f "$zip_file" ]; then
+        cp -v -r "$zip_file" "${OUT_DIR}/plugins"
+    else
+        echo "Plugin file $zip_file does not exist, skipping."
+    fi
+done
+
+# Copy parser package jars as plugins
+parser_packages=(
+    "tika-parsers/tika-parsers-standard/tika-parsers-standard-package"
+    "tika-parsers/tika-parsers-extended/tika-parser-scientific-package"
+    "tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package"
+    "tika-parsers/tika-parsers-ml/tika-parser-nlp-package"
+)
+
+for parser_package in "${parser_packages[@]}"; do
+    package_name=$(basename "$parser_package")
+    jar_file="${parser_package}/target/${package_name}-${TIKA_VERSION}.jar"
+    if [ -f "$jar_file" ]; then
+        cp -v -r "$jar_file" "${OUT_DIR}/plugins"
+    else
+        echo "Parser package file $jar_file does not exist, skipping."
+    fi
+done
+
+cp -v -r "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin"
+
+cp -v "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile"
+
+cd "${OUT_DIR}" || exit
+
+echo "Running docker build from directory: $(pwd)"
+
+IMAGE_TAGS=()
+if [[ -n "${AWS_ACCOUNT_ID}" ]]; then
+    aws ecr get-login-password --region ${AWS_REGION} | docker login --username AWS --password-stdin ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com
+    IMAGE_TAGS+=("-t ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${AZURE_REGISTRY_NAME}" ]]; then
+    az acr login --name ${AZURE_REGISTRY_NAME}
+    IMAGE_TAGS+=("-t ${AZURE_REGISTRY_NAME}.azurecr.io/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [[ -n "${DOCKER_ID}" ]]; then
+    IMAGE_TAGS+=("-t ${DOCKER_ID}/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
+fi
+
+if [ ${#IMAGE_TAGS[@]} -eq 0 ]; then
+    echo "No image tags specified, skipping Docker build step. To enable build, set AWS_ACCOUNT_ID, AZURE_REGISTRY_NAME, and/or DOCKER_ID environment variables."
+    exit 0
+fi
+
+tag="${IMAGE_TAGS[*]}"
+if [ "${MULTI_ARCH}" == "true" ]; then
+  echo "Building multi arch image"
+  docker buildx create --name tikabuilder
+  # see https://askubuntu.com/questions/1339558/cant-build-dockerfile-for-arm64-due-to-libc-bin-segmentation-fault/1398147#1398147
+  docker run --rm --privileged tonistiigi/binfmt --install amd64
+  docker run --rm --privileged tonistiigi/binfmt --install arm64

Review Comment:
   The multi-arch build path runs the third-party image `tonistiigi/binfmt` 
with the `--privileged` flag and without pinning it to a specific version or 
digest. If this Docker Hub image is ever compromised or replaced, anyone 
running multi-arch builds could execute attacker-controlled code on the build 
host with elevated privileges and expose registry/cloud credentials used during 
the build. To reduce this supply-chain risk, pin `tonistiigi/binfmt` to a 
trusted immutable digest (or vetted version) and avoid `--privileged` if 
possible (e.g., use a narrower capability set or an alternative approach for 
enabling binfmt).
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to