This is an automated email from the ASF dual-hosted git repository.
dkuzmenko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new c46dbf203a9 HIVE-29346: Optimize the HMS Docker image and enable S3
support (#6215)
c46dbf203a9 is described below
commit c46dbf203a9b7f08b1eb448773d1ba54e045fa9b
Author: Denys Kuzmenko <[email protected]>
AuthorDate: Fri Nov 28 12:49:47 2025 +0100
HIVE-29346: Optimize the HMS Docker image and enable S3 support (#6215)
---
.../hive/metastore/HiveProtoEventsCleanerTask.java | 10 ++--
packaging/src/docker/Dockerfile | 55 +++++++++--------
packaging/src/docker/README.md | 24 ++++++++
.../src/docker/conf/core-site.xml.template | 16 ++---
.../conf/{hive-site.xml => hive-site.xml.template} | 2 +-
packaging/src/docker/docker-compose.yml | 26 +++++++--
packaging/src/docker/entrypoint.sh | 25 ++++++++
.../packaging/src/docker/Dockerfile | 59 +++++++++++++------
.../packaging/src/docker/README.md | 68 ++++++++++++++--------
.../{metastore-site.xml => core-site.xml.template} | 16 ++---
...astore-site.xml => metastore-site.xml.template} | 2 +-
.../packaging/src/docker/docker-compose.yml | 19 ++++--
.../packaging/src/docker/entrypoint.sh | 27 ++++++++-
13 files changed, 248 insertions(+), 101 deletions(-)
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveProtoEventsCleanerTask.java b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveProtoEventsCleanerTask.java
index 2a772e2e0f4..f69a16855b0 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveProtoEventsCleanerTask.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveProtoEventsCleanerTask.java
@@ -26,14 +26,13 @@
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.security.UserGroupInformation;
-import org.apache.hadoop.yarn.util.SystemClock;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.security.PrivilegedExceptionAction;
+import java.time.Instant;
import java.time.LocalDate;
-import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
@@ -48,7 +47,6 @@ public class HiveProtoEventsCleanerTask implements MetastoreTaskThread {
private Configuration conf;
private long ttl;
private static String expiredDatePtn = null;
- private static final SystemClock clock = SystemClock.getInstance();
@Override
public void setConf(Configuration conf) {
@@ -95,9 +93,11 @@ public void run() {
* Compute the expired date partition, using the underlying clock in UTC time.
*/
private static void computeExpiredDatePtn(long ttl) {
+ LocalDate expiredDate = LocalDate.ofInstant(
+ Instant.now().minusMillis(ttl),
+ ZoneOffset.UTC
+ );
// Use UTC date to ensure reader date is same on all timezones.
- LocalDate expiredDate
- = LocalDateTime.ofEpochSecond((clock.getTime() - ttl) / 1000, 0, ZoneOffset.UTC).toLocalDate();
expiredDatePtn = "date=" + DateTimeFormatter.ISO_LOCAL_DATE.format(expiredDate);
}
diff --git a/packaging/src/docker/Dockerfile b/packaging/src/docker/Dockerfile
index fc39ac70d76..2920e5d0df0 100644
--- a/packaging/src/docker/Dockerfile
+++ b/packaging/src/docker/Dockerfile
@@ -50,27 +50,38 @@ ARG HADOOP_VERSION
ARG HIVE_VERSION
ARG TEZ_VERSION
-RUN tar -xzvf /opt/hadoop-$HADOOP_VERSION.tar.gz -C /opt/ && \
- rm -rf /opt/hadoop-$HADOOP_VERSION/share/doc/* && \
- tar -xzvf /opt/apache-hive-$HIVE_VERSION-bin.tar.gz -C /opt/ && \
- rm -rf /opt/apache-hive-$HIVE_VERSION-bin/jdbc/* && \
- tar -xzvf /opt/apache-tez-$TEZ_VERSION-bin.tar.gz -C /opt && \
- rm -rf /opt/apache-tez-$TEZ_VERSION-bin/share/*
+RUN tar -xzv \
+ --exclude="hadoop-$HADOOP_VERSION/share/doc" \
+ --exclude="*/jdiff" \
+ --exclude="*/sources" \
+ --exclude="*tests.jar" \
+ --exclude="*/webapps" \
+ -f /opt/hadoop-$HADOOP_VERSION.tar.gz \
+ -C /opt/ && \
+ # INSTALL HIVE
+ tar -xzv \
+ --exclude="apache-hive-$HIVE_VERSION-bin/jdbc" \
+ -f /opt/apache-hive-$HIVE_VERSION-bin.tar.gz \
+ -C /opt/ && \
+ # INSTALL TEZ
+ tar -xzv \
+ --exclude="apache-tez-$TEZ_VERSION-bin/share" \
+ -f /opt/apache-tez-$TEZ_VERSION-bin.tar.gz \
+ -C /opt
FROM eclipse-temurin:21.0.3_9-jre-ubi9-minimal AS run
+ARG UID=1000
ARG HADOOP_VERSION
ARG HIVE_VERSION
ARG TEZ_VERSION
-COPY --from=env /opt/hadoop-$HADOOP_VERSION /opt/hadoop
-COPY --from=env /opt/apache-hive-$HIVE_VERSION-bin /opt/hive
-COPY --from=env /opt/apache-tez-$TEZ_VERSION-bin /opt/tez
# Install dependencies
RUN set -ex; \
microdnf update -y; \
- microdnf -y install procps; \
- rm -rf /var/lib/apt/lists/*
+ microdnf -y install procps gettext; \
+ microdnf clean all; \
+ useradd --no-create-home -s /sbin/nologin -c "" --uid $UID hive
# Set necessary environment variables.
ENV HADOOP_HOME=/opt/hadoop \
@@ -80,23 +91,21 @@ ENV HADOOP_HOME=/opt/hadoop \
ENV PATH=$HIVE_HOME/bin:$HADOOP_HOME/bin:$PATH
-COPY entrypoint.sh /
-COPY conf $HIVE_HOME/conf
-RUN chmod +x /entrypoint.sh
+COPY --from=env --chown=hive /opt/hadoop-$HADOOP_VERSION $HADOOP_HOME
+COPY --from=env --chown=hive /opt/apache-hive-$HIVE_VERSION-bin $HIVE_HOME
+COPY --from=env --chown=hive /opt/apache-tez-$TEZ_VERSION-bin $TEZ_HOME
+COPY --chown=hive entrypoint.sh /
+COPY --chown=hive conf $HIVE_HOME/conf
-ARG UID=1000
-RUN useradd --no-create-home -s /sbin/nologin -c "" --uid $UID hive && \
- chown hive /opt/tez && \
- chown hive /opt/hive && \
- chown hive /opt/hadoop && \
- chown hive /opt/hive/conf && \
- mkdir -p /opt/hive/data/warehouse && \
- chown hive /opt/hive/data/warehouse && \
+RUN chmod +x /entrypoint.sh && \
+ mkdir -p $HIVE_HOME/data/warehouse && \
+ chown hive $HIVE_HOME/data/warehouse && \
mkdir -p /home/hive/.beeline && \
chown hive /home/hive/.beeline
USER hive
-WORKDIR /opt/hive
+WORKDIR $HIVE_HOME
EXPOSE 10000 10002 9083
+
ENTRYPOINT ["sh", "-c", "/entrypoint.sh"]
diff --git a/packaging/src/docker/README.md b/packaging/src/docker/README.md
index bfed5ce40fc..9e2d37c57fe 100644
--- a/packaging/src/docker/README.md
+++ b/packaging/src/docker/README.md
@@ -306,3 +306,27 @@ docker compose exec hiveserver2-standalone /bin/bash
/opt/hive/bin/schematool -initSchema -dbType hive -metaDbType postgres -url
jdbc:hive2://localhost:10000/default
exit
```
+
+#### Hive with S3-backed warehouse storage
+
+1. Download the AWS SDK bundle and place it under jars/ directory.
+
+**Disclaimer:**
+Hadoop **3.4.1** requires **AWS SDK v2**.
+```shell
+wget https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/2.26.19/bundle-2.26.19.jar -P jars/
+```
+
+2. Set the following environment variables:
+- AWS_ACCESS_KEY_ID
+- AWS_SECRET_ACCESS_KEY
+- DEFAULT_FS
+- HIVE_WAREHOUSE_PATH
+- S3_ENDPOINT_URL
+
+```shell
+DEFAULT_FS="s3a://dw-team-bucket" \
+HIVE_WAREHOUSE_PATH="/data/warehouse/tablespace/managed/hive" \
+S3_ENDPOINT_URL="s3.us-west-2.amazonaws.com" \
+docker-compose up
+```
diff --git a/standalone-metastore/packaging/src/docker/conf/metastore-site.xml
b/packaging/src/docker/conf/core-site.xml.template
similarity index 73%
copy from standalone-metastore/packaging/src/docker/conf/metastore-site.xml
copy to packaging/src/docker/conf/core-site.xml.template
index bb2aec2a931..f15441157b5 100644
--- a/standalone-metastore/packaging/src/docker/conf/metastore-site.xml
+++ b/packaging/src/docker/conf/core-site.xml.template
@@ -17,19 +17,15 @@
-->
<configuration>
<property>
- <name>metastore.warehouse.dir</name>
- <value>/opt/hive/data/warehouse</value>
+ <name>fs.s3a.endpoint</name>
+ <value>${S3_ENDPOINT_URL}</value>
</property>
<property>
- <name>metastore.event.db.notification.api.auth</name>
- <value>false</value>
+ <name>fs.s3a.access.key</name>
+ <value>${AWS_ACCESS_KEY_ID}</value>
</property>
<property>
- <name>metastore.catalog.servlet.port</name>
- <value>9001</value>
- </property>
- <property>
- <name>metastore.catalog.servlet.auth</name>
- <value>none</value>
+ <name>fs.s3a.secret.key</name>
+ <value>${AWS_SECRET_ACCESS_KEY}</value>
</property>
</configuration>
diff --git a/packaging/src/docker/conf/hive-site.xml
b/packaging/src/docker/conf/hive-site.xml.template
similarity index 97%
rename from packaging/src/docker/conf/hive-site.xml
rename to packaging/src/docker/conf/hive-site.xml.template
index 8dc9b81064b..5639ac185c7 100644
--- a/packaging/src/docker/conf/hive-site.xml
+++ b/packaging/src/docker/conf/hive-site.xml.template
@@ -58,7 +58,7 @@
</property>
<property>
<name>hive.metastore.warehouse.dir</name>
- <value>/opt/hive/data/warehouse</value>
+ <value>${DEFAULT_FS}${HIVE_WAREHOUSE_PATH}</value>
</property>
<property>
<name>hive.metastore.event.db.notification.api.auth</name>
diff --git a/packaging/src/docker/docker-compose.yml
b/packaging/src/docker/docker-compose.yml
index f991568a164..a098c2b767c 100644
--- a/packaging/src/docker/docker-compose.yml
+++ b/packaging/src/docker/docker-compose.yml
@@ -40,12 +40,21 @@ services:
container_name: metastore
hostname: metastore
environment:
+ DEFAULT_FS: "${DEFAULT_FS}"
+ HIVE_WAREHOUSE_PATH: "${HIVE_WAREHOUSE_PATH}"
+ HADOOP_CLASSPATH: /opt/hadoop/share/hadoop/tools/lib/*
DB_DRIVER: postgres
SERVICE_NAME: 'metastore'
- SERVICE_OPTS: '-Xmx1G -Djavax.jdo.option.ConnectionDriverName=org.postgresql.Driver
- -Djavax.jdo.option.ConnectionURL=jdbc:postgresql://postgres:5432/metastore_db
- -Djavax.jdo.option.ConnectionUserName=hive
- -Djavax.jdo.option.ConnectionPassword=password'
+ SERVICE_OPTS: >
+ -Xmx1G
+ -Djavax.jdo.option.ConnectionDriverName=org.postgresql.Driver
+ -Djavax.jdo.option.ConnectionURL=jdbc:postgresql://postgres:5432/metastore_db
+ -Djavax.jdo.option.ConnectionUserName=hive
+ -Djavax.jdo.option.ConnectionPassword=password
+
+ S3_ENDPOINT_URL: "${S3_ENDPOINT_URL}"
+ AWS_ACCESS_KEY_ID: "${AWS_ACCESS_KEY_ID}"
+ AWS_SECRET_ACCESS_KEY: "${AWS_SECRET_ACCESS_KEY}"
ports:
- '9083:9083'
volumes:
@@ -53,6 +62,8 @@ services:
- type: bind
source: ${POSTGRES_LOCAL_PATH}
target: /opt/hive/lib/postgres.jar
+ # Mount local jars to a temporary staging area (Read-Only)
+ - ./jars:/tmp/ext-jars:ro
networks:
- hive
@@ -63,15 +74,22 @@ services:
restart: unless-stopped
container_name: hiveserver2
environment:
+ HADOOP_CLASSPATH: /opt/hadoop/share/hadoop/tools/lib/*
HIVE_SERVER2_THRIFT_PORT: 10000
SERVICE_OPTS: '-Xmx1G -Dhive.metastore.uris=thrift://metastore:9083'
IS_RESUME: 'true'
SERVICE_NAME: 'hiveserver2'
+
+ S3_ENDPOINT_URL: "${S3_ENDPOINT_URL}"
+ AWS_ACCESS_KEY_ID: "${AWS_ACCESS_KEY_ID}"
+ AWS_SECRET_ACCESS_KEY: "${AWS_SECRET_ACCESS_KEY}"
ports:
- '10000:10000'
- '10002:10002'
volumes:
- warehouse:/opt/hive/data/warehouse
+ # Mount local jars to a temporary staging area (Read-Only)
+ - ./jars:/tmp/ext-jars:ro
networks:
- hive
diff --git a/packaging/src/docker/entrypoint.sh
b/packaging/src/docker/entrypoint.sh
index 10d69e67ce9..3ae61409985 100644
--- a/packaging/src/docker/entrypoint.sh
+++ b/packaging/src/docker/entrypoint.sh
@@ -19,6 +19,31 @@
set -x
+# =========================================================================
+# DYNAMIC JAR LOADER (AWS/S3 Support)
+# =========================================================================
+STAGING_DIR="/tmp/ext-jars"
+
+# Checks if /tmp/ext-jars is mounted (via Docker volume).
+if [ -d "$STAGING_DIR" ]; then
+ if ls "$STAGING_DIR"/*.jar 1> /dev/null 2>&1; then
+ echo "--> Copying custom jars from volume to Hive..."
+ cp -vf "$STAGING_DIR"/*.jar "${HIVE_HOME}/lib/"
+ else
+ echo "--> Volume mounted at $STAGING_DIR, but no jars found."
+ fi
+fi
+
+# =========================================================================
+# REPLACE ${VARS} in the template
+# =========================================================================
+: "${HIVE_WAREHOUSE_PATH:=/opt/hive/data/warehouse}"
+export HIVE_WAREHOUSE_PATH
+
+envsubst < $HIVE_HOME/conf/core-site.xml.template > $HIVE_HOME/conf/core-site.xml
+envsubst < $HIVE_HOME/conf/hive-site.xml.template > $HIVE_HOME/conf/hive-site.xml
+# =========================================================================
+
: "${DB_DRIVER:=derby}"
SKIP_SCHEMA_INIT="${IS_RESUME:-false}"
diff --git a/standalone-metastore/packaging/src/docker/Dockerfile
b/standalone-metastore/packaging/src/docker/Dockerfile
index f9a6ff3576a..fae657d917c 100644
--- a/standalone-metastore/packaging/src/docker/Dockerfile
+++ b/standalone-metastore/packaging/src/docker/Dockerfile
@@ -42,22 +42,47 @@ RUN echo ${BUILD_ENV}
ARG HADOOP_VERSION
ARG HIVE_VERSION
-RUN tar -xzvf /opt/hadoop-$HADOOP_VERSION.tar.gz -C /opt/ && \
- rm -rf /opt/hadoop-$HADOOP_VERSION/share/doc/* && \
- tar -xzvf /opt/hive-standalone-metastore-$HIVE_VERSION-bin.tar.gz -C /opt/
+RUN apt-get update && \
+ apt-get install -y wget
+
+RUN tar -xzv \
+ --exclude="hadoop-$HADOOP_VERSION/lib/native" \
+ --exclude="hadoop-$HADOOP_VERSION/share/doc" \
+ --exclude="hadoop-$HADOOP_VERSION/share/hadoop/client" \
+ --exclude="hadoop-$HADOOP_VERSION/share/hadoop/tools" \
+ --exclude="hadoop-$HADOOP_VERSION/share/hadoop/yarn/*" \
+ --exclude="*/jdiff" \
+ --exclude="*/sources" \
+ --exclude="*tests.jar" \
+ --exclude="*/webapps" \
+ -f /opt/hadoop-$HADOOP_VERSION.tar.gz \
+ -C /opt/ && \
+ \
+ find /opt/hadoop-$HADOOP_VERSION/share/hadoop/common/lib \
+ \( -name "jetty-*.jar" -o -name "zookeeper-*.jar" -o -name "netty-*.jar" \) \
+ -delete && \
+ # Extract hadoop jars only
+ tar -xzv \
+ -f /opt/hadoop-$HADOOP_VERSION.tar.gz \
+ -C /opt/ \
+ --wildcards "hadoop-$HADOOP_VERSION/share/hadoop/tools/lib/hadoop-*.jar" && \
+ # INSTALL HIVE
+ tar -xzv \
+ -f /opt/hive-standalone-metastore-$HIVE_VERSION-bin.tar.gz \
+ -C /opt/
FROM eclipse-temurin:21.0.3_9-jre-ubi9-minimal AS run
+ARG UID=1000
ARG HADOOP_VERSION
ARG HIVE_VERSION
-COPY --from=env /opt/hadoop-$HADOOP_VERSION /opt/hadoop
-COPY --from=env /opt/apache-hive-metastore-$HIVE_VERSION-bin /opt/hive
# Install dependencies
RUN set -ex; \
microdnf update -y; \
- microdnf -y install procps; \
- rm -rf /var/lib/apt/lists/*
+ microdnf -y install procps gettext; \
+ microdnf clean all; \
+ useradd --no-create-home -s /sbin/nologin -c "" --uid $UID hive
# Set necessary environment variables.
ENV HADOOP_HOME=/opt/hadoop \
@@ -66,20 +91,18 @@ ENV HADOOP_HOME=/opt/hadoop \
ENV PATH=$HIVE_HOME/bin:$HADOOP_HOME/bin:$PATH
-COPY entrypoint.sh /
-COPY conf $HIVE_HOME/conf
-RUN chmod +x /entrypoint.sh
+COPY --from=env --chown=hive /opt/hadoop-$HADOOP_VERSION $HADOOP_HOME
+COPY --from=env --chown=hive /opt/apache-hive-metastore-$HIVE_VERSION-bin $HIVE_HOME
+COPY --chown=hive entrypoint.sh /
+COPY --chown=hive conf $HIVE_HOME/conf
-ARG UID=1000
-RUN useradd --no-create-home -s /sbin/nologin -c "" --uid $UID hive && \
- chown hive /opt/hive && \
- chown hive /opt/hadoop && \
- chown hive /opt/hive/conf && \
- mkdir -p /opt/hive/data/warehouse && \
- chown hive /opt/hive/data/warehouse
+RUN chmod +x /entrypoint.sh && \
+ mkdir -p $HIVE_HOME/data/warehouse && \
+ chown hive $HIVE_HOME/data/warehouse
USER hive
-WORKDIR /opt/hive
+WORKDIR $HIVE_HOME
EXPOSE 9001 9083
+
ENTRYPOINT ["sh", "-c", "/entrypoint.sh"]
diff --git a/standalone-metastore/packaging/src/docker/README.md
b/standalone-metastore/packaging/src/docker/README.md
index eacbdc7edea..e4be92f4930 100644
--- a/standalone-metastore/packaging/src/docker/README.md
+++ b/standalone-metastore/packaging/src/docker/README.md
@@ -84,30 +84,30 @@ or assuming that you're relying on current
`project.version` from pom.xml,
```shell
export HIVE_VERSION=$(mvn -f pom.xml -q help:evaluate
-Dexpression=project.version -DforceStdout)
```
-- Metastore
+#### Metastore
For a quick start, launch the Metastore with Derby,
- ```shell
- docker run -d -p 9083:9083 --name metastore-standalone
apache/hive:standalone-metastore-${HIVE_VERSION}
- ```
- Everything would be lost when the service is down. In order to save the Hive
table's schema and data, start the container with an external Postgres and
Volume to keep them,
-
- ```shell
- docker run -d -p 9083:9083 --env DB_DRIVER=postgres \
- --env
SERVICE_OPTS="-Djavax.jdo.option.ConnectionDriverName=org.postgresql.Driver
-Djavax.jdo.option.ConnectionURL=jdbc:postgresql://postgres:5432/metastore_db
-Djavax.jdo.option.ConnectionUserName=hive
-Djavax.jdo.option.ConnectionPassword=password" \
- --mount source=warehouse,target=/opt/hive/data/warehouse \
- --mount type=bind,source=`mvn help:evaluate
-Dexpression=settings.localRepository -q
-DforceStdout`/org/postgresql/postgresql/42.7.3/postgresql-42.7.3.jar,target=/opt/hive/lib/postgres.jar
\
- --name metastore-standalone
apache/hive:standalone-metastore-${HIVE_VERSION}
- ```
-
- If you want to use your own `hdfs-site.xml` for the service, you can provide
the environment variable `HIVE_CUSTOM_CONF_DIR` for the command. For instance,
put the custom configuration file under the directory `/opt/hive/conf`, then
run,
-
- ```shell
- docker run -d -p 9083:9083 --env DB_DRIVER=postgres \
- -v /opt/hive/conf:/hive_custom_conf --env
HIVE_CUSTOM_CONF_DIR=/hive_custom_conf \
- --mount type=bind,source=`mvn help:evaluate
-Dexpression=settings.localRepository -q
-DforceStdout`/org/postgresql/postgresql/42.7.3/postgresql-42.7.3.jar,target=/opt/hive/lib/postgres.jar
\
- --name metastore apache/hive:standalone-metastore-${HIVE_VERSION}
- ```
+```shell
+docker run -d -p 9083:9083 --name metastore-standalone
apache/hive:standalone-metastore-${HIVE_VERSION}
+```
+Everything would be lost when the service is down. In order to save the Hive
table's schema and data, start the container with an external Postgres and
Volume to keep them,
+
+```shell
+docker run -d -p 9083:9083 --env DB_DRIVER=postgres \
+ --env
SERVICE_OPTS="-Djavax.jdo.option.ConnectionDriverName=org.postgresql.Driver
-Djavax.jdo.option.ConnectionURL=jdbc:postgresql://postgres:5432/metastore_db
-Djavax.jdo.option.ConnectionUserName=hive
-Djavax.jdo.option.ConnectionPassword=password" \
+ --mount source=warehouse,target=/opt/hive/data/warehouse \
+ --mount type=bind,source=`mvn help:evaluate
-Dexpression=settings.localRepository -q
-DforceStdout`/org/postgresql/postgresql/42.7.3/postgresql-42.7.3.jar,target=/opt/hive/lib/postgres.jar
\
+ --name metastore-standalone
apache/hive:standalone-metastore-${HIVE_VERSION}
+```
+
+If you want to use your own `hdfs-site.xml` for the service, you can provide
the environment variable `HIVE_CUSTOM_CONF_DIR` for the command. For instance,
put the custom configuration file under the directory `/opt/hive/conf`, then
run,
+
+```shell
+docker run -d -p 9083:9083 --env DB_DRIVER=postgres \
+ -v /opt/hive/conf:/hive_custom_conf --env
HIVE_CUSTOM_CONF_DIR=/hive_custom_conf \
+ --mount type=bind,source=`mvn help:evaluate
-Dexpression=settings.localRepository -q
-DforceStdout`/org/postgresql/postgresql/42.7.3/postgresql-42.7.3.jar,target=/opt/hive/lib/postgres.jar
\
+ --name metastore apache/hive:standalone-metastore-${HIVE_VERSION}
+```
NOTE:
@@ -116,7 +116,7 @@ then add "--env SCHEMA_COMMAND=upgradeSchema" to the
command.
2) If the full Acid support (Compaction) is needed, use the Hive docker image
to bring up the container.
-- Metastore with Postgres
+#### Metastore with Postgres
To spin up Metastore with a remote DB, there is a `docker-compose.yml` placed
under `packaging/src/docker` for this purpose,
specify the `POSTGRES_LOCAL_PATH` first:
@@ -131,9 +131,29 @@ export POSTGRES_LOCAL_PATH=`mvn help:evaluate
-Dexpression=settings.localReposit
If you don't install maven or have problem in resolving the postgres driver,
you can always download this jar yourself,
change the `POSTGRES_LOCAL_PATH` to the path of the downloaded jar.
+#### Metastore with S3-backed warehouse storage
+
+1. Download the AWS SDK bundle and place it under jars/ directory.
+
+**Disclaimer:**
+Hadoop **3.4.1** requires **AWS SDK v2**.
+```shell
+wget https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/2.26.19/bundle-2.26.19.jar -P jars/
+```
+
+2. Set the following environment variables:
+- AWS_ACCESS_KEY_ID
+- AWS_SECRET_ACCESS_KEY
+- DEFAULT_FS
+- HIVE_WAREHOUSE_PATH
+- S3_ENDPOINT_URL
+
Then,
```shell
-docker compose up -d
+DEFAULT_FS="s3a://dw-team-bucket" \
+HIVE_WAREHOUSE_PATH="/data/warehouse/tablespace/managed/hive" \
+S3_ENDPOINT_URL="s3.us-west-2.amazonaws.com" \
+docker-compose up
```
Metastore and Postgres services will be started as a consequence.
diff --git a/standalone-metastore/packaging/src/docker/conf/metastore-site.xml
b/standalone-metastore/packaging/src/docker/conf/core-site.xml.template
similarity index 73%
copy from standalone-metastore/packaging/src/docker/conf/metastore-site.xml
copy to standalone-metastore/packaging/src/docker/conf/core-site.xml.template
index bb2aec2a931..f15441157b5 100644
--- a/standalone-metastore/packaging/src/docker/conf/metastore-site.xml
+++ b/standalone-metastore/packaging/src/docker/conf/core-site.xml.template
@@ -17,19 +17,15 @@
-->
<configuration>
<property>
- <name>metastore.warehouse.dir</name>
- <value>/opt/hive/data/warehouse</value>
+ <name>fs.s3a.endpoint</name>
+ <value>${S3_ENDPOINT_URL}</value>
</property>
<property>
- <name>metastore.event.db.notification.api.auth</name>
- <value>false</value>
+ <name>fs.s3a.access.key</name>
+ <value>${AWS_ACCESS_KEY_ID}</value>
</property>
<property>
- <name>metastore.catalog.servlet.port</name>
- <value>9001</value>
- </property>
- <property>
- <name>metastore.catalog.servlet.auth</name>
- <value>none</value>
+ <name>fs.s3a.secret.key</name>
+ <value>${AWS_SECRET_ACCESS_KEY}</value>
</property>
</configuration>
diff --git a/standalone-metastore/packaging/src/docker/conf/metastore-site.xml
b/standalone-metastore/packaging/src/docker/conf/metastore-site.xml.template
similarity index 95%
rename from standalone-metastore/packaging/src/docker/conf/metastore-site.xml
rename to
standalone-metastore/packaging/src/docker/conf/metastore-site.xml.template
index bb2aec2a931..d69f05ba47c 100644
--- a/standalone-metastore/packaging/src/docker/conf/metastore-site.xml
+++ b/standalone-metastore/packaging/src/docker/conf/metastore-site.xml.template
@@ -18,7 +18,7 @@
<configuration>
<property>
<name>metastore.warehouse.dir</name>
- <value>/opt/hive/data/warehouse</value>
+ <value>${DEFAULT_FS}${HIVE_WAREHOUSE_PATH}</value>
</property>
<property>
<name>metastore.event.db.notification.api.auth</name>
diff --git a/standalone-metastore/packaging/src/docker/docker-compose.yml
b/standalone-metastore/packaging/src/docker/docker-compose.yml
index 3440f7007ec..852953c0295 100644
--- a/standalone-metastore/packaging/src/docker/docker-compose.yml
+++ b/standalone-metastore/packaging/src/docker/docker-compose.yml
@@ -40,12 +40,21 @@ services:
container_name: metastore
hostname: metastore
environment:
+ DEFAULT_FS: "${DEFAULT_FS}"
+ HIVE_WAREHOUSE_PATH: "${HIVE_WAREHOUSE_PATH}"
+ HADOOP_CLASSPATH: /opt/hadoop/share/hadoop/tools/lib/*
DB_DRIVER: postgres
SERVICE_NAME: 'metastore'
- SERVICE_OPTS: '-Xmx1G -Djavax.jdo.option.ConnectionDriverName=org.postgresql.Driver
- -Djavax.jdo.option.ConnectionURL=jdbc:postgresql://postgres:5432/metastore_db
- -Djavax.jdo.option.ConnectionUserName=hive
- -Djavax.jdo.option.ConnectionPassword=password'
+ SERVICE_OPTS: >
+ -Xmx1G
+ -Djavax.jdo.option.ConnectionDriverName=org.postgresql.Driver
+ -Djavax.jdo.option.ConnectionURL=jdbc:postgresql://postgres:5432/metastore_db
+ -Djavax.jdo.option.ConnectionUserName=hive
+ -Djavax.jdo.option.ConnectionPassword=password
+
+ S3_ENDPOINT_URL: "${S3_ENDPOINT_URL}"
+ AWS_ACCESS_KEY_ID: "${AWS_ACCESS_KEY_ID}"
+ AWS_SECRET_ACCESS_KEY: "${AWS_SECRET_ACCESS_KEY}"
ports:
- '9001:9001'
- '9083:9083'
@@ -54,6 +63,8 @@ services:
- type: bind
source: ${POSTGRES_LOCAL_PATH}
target: /opt/hive/lib/postgres.jar
+ # Mount local jars to a temporary staging area (Read-Only)
+ - ./jars:/tmp/ext-jars:ro
networks:
- hive
diff --git a/standalone-metastore/packaging/src/docker/entrypoint.sh
b/standalone-metastore/packaging/src/docker/entrypoint.sh
index 6ce038904d4..ee72357f9e4 100644
--- a/standalone-metastore/packaging/src/docker/entrypoint.sh
+++ b/standalone-metastore/packaging/src/docker/entrypoint.sh
@@ -19,6 +19,31 @@
set -x
+# =========================================================================
+# DYNAMIC JAR LOADER (AWS/S3 Support)
+# =========================================================================
+STAGING_DIR="/tmp/ext-jars"
+
+# Checks if /tmp/ext-jars is mounted (via Docker volume).
+if [ -d "$STAGING_DIR" ]; then
+ if ls "$STAGING_DIR"/*.jar 1> /dev/null 2>&1; then
+ echo "--> Copying custom jars from volume to Hive..."
+ cp -vf "$STAGING_DIR"/*.jar "${HIVE_HOME}/lib/"
+ else
+ echo "--> Volume mounted at $STAGING_DIR, but no jars found."
+ fi
+fi
+
+# =========================================================================
+# REPLACE ${VARS} in the template
+# =========================================================================
+: "${HIVE_WAREHOUSE_PATH:=/opt/hive/data/warehouse}"
+export HIVE_WAREHOUSE_PATH
+
+envsubst < $HIVE_HOME/conf/core-site.xml.template > $HIVE_HOME/conf/core-site.xml
+envsubst < $HIVE_HOME/conf/metastore-site.xml.template > $HIVE_HOME/conf/metastore-site.xml
+# =========================================================================
+
: "${DB_DRIVER:=derby}"
SKIP_SCHEMA_INIT="${IS_RESUME:-false}"
@@ -52,4 +77,4 @@ if [[ "${SKIP_SCHEMA_INIT}" == "false" ]]; then
fi
export METASTORE_PORT=${METASTORE_PORT:-9083}
-exec "$HIVE_HOME/bin/start-metastore"
+exec "$HIVE_HOME/bin/start-metastore"