This is an automated email from the ASF dual-hosted git repository.
lesun pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-gobblin.git
The following commit(s) were added to refs/heads/master by this push:
new c002a1b [GOBBLIN-1317] Update Docker integration guide. Add Kafka,
HDFS recipes for docker
c002a1b is described below
commit c002a1baf73b9c23541b22714c375dbbd8a3dd43
Author: Hanghang Liu <[email protected]>
AuthorDate: Mon Dec 14 10:07:29 2020 -0800
[GOBBLIN-1317] Update Docker integration guide. Add Kafka, HDFS recipes for
docker
Add Apache License
refine docker integration doc. correct the indent
in docker-compose
update docker image repo. Add a simple GaaS
section
fix typo
fix GaaS docker image link
add one newline at the end
add restart on failure for kafka
Closes #3154 from hanghangliu/gobblin-1317-add-
docker-recipes-documentations
---
.../gobblin-recipes/kafka-hdfs/docker-compose.yml | 128 +++++++++++++++++++++
.../gobblin-recipes/kafka-hdfs/hadoop.env | 60 ++++++++++
gobblin-docs/user-guide/Docker-Integration.md | 106 ++++++++++-------
gobblin-example/src/main/resources/kafka-hdfs.pull | 50 ++++++++
4 files changed, 304 insertions(+), 40 deletions(-)
diff --git a/gobblin-docker/gobblin-recipes/kafka-hdfs/docker-compose.yml
b/gobblin-docker/gobblin-recipes/kafka-hdfs/docker-compose.yml
new file mode 100644
index 0000000..2b6d752
--- /dev/null
+++ b/gobblin-docker/gobblin-recipes/kafka-hdfs/docker-compose.yml
@@ -0,0 +1,128 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+version: '3'
+services:
+ gobblin-standalone:
+ image: apache/gobblin:latest
+ volumes:
+ - "${LOCAL_JOB_DIR}:/tmp/gobblin-standalone/jobs"
+ zookeeper:
+ image: wurstmeister/zookeeper
+ ports:
+ - "2181:2181"
+ kafka:
+ image: wurstmeister/kafka
+ restart: on-failure
+ ports:
+ - "9092:9092"
+ environment:
+ KAFKA_ADVERTISED_HOST_NAME: "kafka"
+ KAFKA_ADVERTISED_PORT: "9092"
+ KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
+ KAFKA_CREATE_TOPICS: "test:1:1"
+ volumes:
+ - /var/run/docker.sock:/var/run/docker.sock
+
+ namenode:
+ image: bde2020/hadoop-namenode:1.1.0-hadoop2.7.1-java8
+ container_name: namenode
+ volumes:
+ - hadoop_namenode:/hadoop/dfs/name
+ environment:
+ - CLUSTER_NAME=test_cluster
+ env_file:
+ - ./hadoop.env
+ ports:
+ - "9870:9870"
+
+ resourcemanager:
+ image: bde2020/hadoop-resourcemanager:1.1.0-hadoop2.7.1-java8
+ container_name: resourcemanager
+ restart: on-failure
+ depends_on:
+ - namenode
+ - datanode1
+ - datanode2
+ - datanode3
+ env_file:
+ - ./hadoop.env
+ ports:
+ - "8089:8088"
+
+ historyserver:
+ image: bde2020/hadoop-historyserver:1.1.0-hadoop2.7.1-java8
+ container_name: historyserver
+ depends_on:
+ - namenode
+ - datanode1
+ - datanode2
+ volumes:
+ - hadoop_historyserver:/hadoop/yarn/timeline
+ env_file:
+ - ./hadoop.env
+ ports:
+ - "8188:8188"
+
+ nodemanager1:
+ image: bde2020/hadoop-nodemanager:1.1.0-hadoop2.7.1-java8
+ container_name: nodemanager1
+ depends_on:
+ - namenode
+ - datanode1
+ - datanode2
+ env_file:
+ - ./hadoop.env
+ ports:
+ - "8042:8042"
+
+ datanode1:
+ image: bde2020/hadoop-datanode:1.1.0-hadoop2.7.1-java8
+ container_name: datanode1
+ depends_on:
+ - namenode
+ volumes:
+ - hadoop_datanode1:/hadoop/dfs/data
+ env_file:
+ - ./hadoop.env
+
+ datanode2:
+ image: bde2020/hadoop-datanode:1.1.0-hadoop2.7.1-java8
+ container_name: datanode2
+ depends_on:
+ - namenode
+ volumes:
+ - hadoop_datanode2:/hadoop/dfs/data
+ env_file:
+ - ./hadoop.env
+
+ datanode3:
+ image: bde2020/hadoop-datanode:1.1.0-hadoop2.7.1-java8
+ container_name: datanode3
+ depends_on:
+ - namenode
+ volumes:
+ - hadoop_datanode3:/hadoop/dfs/data
+ env_file:
+ - ./hadoop.env
+
+volumes:
+ hadoop_namenode:
+ hadoop_datanode1:
+ hadoop_datanode2:
+ hadoop_datanode3:
+ hadoop_historyserver:
diff --git a/gobblin-docker/gobblin-recipes/kafka-hdfs/hadoop.env
b/gobblin-docker/gobblin-recipes/kafka-hdfs/hadoop.env
new file mode 100644
index 0000000..efed6fc
--- /dev/null
+++ b/gobblin-docker/gobblin-recipes/kafka-hdfs/hadoop.env
@@ -0,0 +1,60 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+CORE_CONF_fs_defaultFS=hdfs://namenode:9000
+CORE_CONF_hadoop_http_staticuser_user=root
+CORE_CONF_hadoop_proxyuser_hue_hosts=*
+CORE_CONF_hadoop_proxyuser_hue_groups=*
+CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec
+
+HDFS_CONF_dfs_webhdfs_enabled=true
+HDFS_CONF_dfs_permissions_enabled=false
+HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
+
+YARN_CONF_yarn_log___aggregation___enable=true
+YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
+YARN_CONF_yarn_resourcemanager_recovery_enabled=true
+YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
+YARN_CONF_yarn_resourcemanager_scheduler_class=org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler
+YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___mb=8192
+YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___vcores=4
+YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
+YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
+YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
+YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
+YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
+YARN_CONF_yarn_resourcemanager_resource__tracker_address=resourcemanager:8031
+YARN_CONF_yarn_timeline___service_enabled=true
+YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
+YARN_CONF_yarn_timeline___service_hostname=historyserver
+YARN_CONF_mapreduce_map_output_compress=true
+YARN_CONF_mapred_map_output_compress_codec=org.apache.hadoop.io.compress.SnappyCodec
+YARN_CONF_yarn_nodemanager_resource_memory___mb=16384
+YARN_CONF_yarn_nodemanager_resource_cpu___vcores=8
+YARN_CONF_yarn_nodemanager_disk___health___checker_max___disk___utilization___per___disk___percentage=98.5
+YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
+YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle
+
+MAPRED_CONF_mapreduce_framework_name=yarn
+MAPRED_CONF_mapred_child_java_opts=-Xmx4096m
+MAPRED_CONF_mapreduce_map_memory_mb=4096
+MAPRED_CONF_mapreduce_reduce_memory_mb=8192
+MAPRED_CONF_mapreduce_map_java_opts=-Xmx3072m
+MAPRED_CONF_mapreduce_reduce_java_opts=-Xmx6144m
+MAPRED_CONF_yarn_app_mapreduce_am_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
+MAPRED_CONF_mapreduce_map_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
+MAPRED_CONF_mapreduce_reduce_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
diff --git a/gobblin-docs/user-guide/Docker-Integration.md
b/gobblin-docs/user-guide/Docker-Integration.md
index 2ec4c02..f7989c1 100644
--- a/gobblin-docs/user-guide/Docker-Integration.md
+++ b/gobblin-docs/user-guide/Docker-Integration.md
@@ -12,74 +12,100 @@ For more information on Docker, including how to install
it, check out the docum
# Docker Repositories
-Gobblin currently has four different repositories, and all are on Docker Hub
[here](https://hub.docker.com/u/gobblin/).
+Gobblin currently has four different repositories, and all are on Docker Hub
[here](https://hub.docker.com/u/gobblin/). We are also starting to use
[Apache's
repository](https://hub.docker.com/r/apache/gobblin/tags?page=1&ordering=last_updated)
for our images.
The `gobblin/gobblin-wikipedia` repository contains images that run the
Gobblin Wikipedia job found in the [getting started guide](../Getting-Started).
These images are useful for users new to Docker or Gobblin, they primarily act
as a "Hello World" example for the Gobblin Docker integration.
The `gobblin/gobblin-standalone` repository contains images that run a
[Gobblin standalone service](Gobblin-Deployment#standalone-architecture) inside
a Docker container. These images provide an easy and simple way to setup a
Gobblin standalone service on any Docker compatible machine.
+The `gobblin/gobblin-service` repository contains images that run [Gobblin as
a
service](Building-Gobblin-as-a-Service#running-gobblin-as-a-service-with-docker),
which is a service that takes in a user request (a logical flow) and converts
it into a series of Gobblin Jobs, and monitors these jobs in a distributed
manner.
+
The `gobblin/gobblin-base` and `gobblin/gobblin-distributions` repositories
are for internal use only, and are primarily useful for Gobblin developers.
-## Gobblin-Wikipedia Repository
+# Run Gobblin Standalone
-The Docker images for this repository can be found on Docker Hub
[here](https://hub.docker.com/r/gobblin/gobblin-wikipedia/). These images are
mainly meant to act as a "Hello World" example for the Gobblin-Docker
integration, and to provide a sanity check to see if the Gobblin-Docker
integration is working on a given machine. The image contains the Gobblin
configuration files to run the [Gobblin Wikipedia job](../Getting-Started).
When a container is launched using the `gobblin-wikipedia [...]
+The Docker images for this repository can be found on Docker Hub
[here](https://hub.docker.com/r/gobblin/gobblin-standalone/). These images run
a Gobblin standalone service inside a Docker container. The Gobblin standalone
service is a long running process that can run Gobblin jobs defined in a `.job`
or `.pull` file. The job / pull files are submitted to the standalone service
by placing them in a directory on the local filesystem. The standalone service
monitors this directory for any [...]
-Running the `gobblin-wikipedia` image requires taking following steps (lets
assume we want to an Ubuntu based image):
+### Set working directory
-* Download the images from the `gobblin/gobblin-wikipedia` repository
+Before running docker containers, set a working directory for Gobblin jobs:
-```
-docker pull gobblin/gobblin-wikipedia:ubuntu-gobblin-latest
-```
+`export LOCAL_JOB_DIR=<local_gobblin_directory>`
-* Run the `gobblin/gobblin-wikipedia:ubuntu-gobblin-latest` image in a Docker
container
+We will use this directory as the
[volume](https://docs.docker.com/storage/volumes/) for Gobblin jobs and
outputs. Make sure your Docker has the
[access](https://docs.docker.com/docker-for-mac/#file-sharing) to this folder.
This is the prerequisite for all following example jobs.
-```
-docker run gobblin/gobblin-wikipedia:ubuntu-gobblin-latest
-```
+### Run the docker image with simple wikipedia jobs
-The logs are printed to the console, and no errors should pop up. This should
provide a nice sanity check to ensure that everything is working as expected.
The output of the job will be written to a directory inside the container. When
the container exits that data will be lost. In order to preserve the output of
the job, continue to the next step.
+Run these commands to start the docker image:
-* Preserving the output of a Docker container requires using a [data
volume](https://docs.docker.com/engine/tutorials/dockervolumes/). To do this,
run the below command:
+`docker pull apache/gobblin:latest`
-```
-docker run -v /home/gobblin/work-dir:/home/gobblin/work-dir gobblin-wikipedia
-```
+`docker run -v $LOCAL_JOB_DIR:/tmp/gobblin-standalone/jobs
apache/gobblin:latest`
-The output of the Gobblin-Wikipedia job should now be written to
`/home/gobblin/work-dir/job-output`. The `-v` command in Docker uses a feature
of Docker called [data
volumes](https://docs.docker.com/engine/tutorials/dockervolumes/). The `-v`
option mounts a host directory into a container and is of the form
`[host-directory]:[container-directory]`. Now any modifications to the host
directory can be seen inside the container-directory, and any modifications to
the container-directory can [...]
+After the container spins up, put the
[wikipedia.pull](https://github.com/apache/incubator-gobblin/blob/master/gobblin-example/src/main/resources/wikipedia.pull)
in ${LOCAL_JOB_DIR}. You will see the Gobblin daemon pick up the job, and the
result output is in ${LOCAL_JOB_DIR}/job-output/.
-## Gobblin-Standalone Repository
+This example job corresponds to the [getting started
guide](https://gobblin.readthedocs.io/en/latest/Getting-Started/). With the
docker image, you can focus on the Gobblin functionalities, avoiding the hassle
of building a distribution.
-The Docker images for this repository can be found on Docker Hub
[here](https://hub.docker.com/r/gobblin/gobblin-standalone/). These images run
a Gobblin standalone service inside a Docker container. The Gobblin standalone
service is a long running process that can run Gobblin jobs defined in a `.job`
or `.pull` file. The job / pull files are submitted to the standalone service
by placing them in a directory on the local filesystem. The standalone service
monitors this directory for any [...]
+### Use Gobblin Standalone on Docker for Kafka and HDFS Ingestion
+
+* To ingest from/to Kafka and HDFS by Gobblin, you need to start services for
Zookeeper, Kafka and HDFS along with Gobblin. We use docker
[compose](https://docs.docker.com/compose/) with images contributed to docker
hub. First, you need to create a
[docker-compose.yml](https://github.com/apache/incubator-gobblin/blob/master/gobblin-docker/gobblin-recipes/kafka-hdfs/docker-compose.yml)
file.
+
+* Second, in the same folder of the yml file, create a
[hadoop.env](https://github.com/apache/incubator-gobblin/blob/master/gobblin-docker/gobblin-recipes/kafka-hdfs/hadoop.env)
file to specify all HDFS-related config (copy the content into your .env file).
+
+* Open a terminal in the same folder, pull and run these docker services:
+
+ `docker-compose -f ./docker-compose.yml pull`
+
+ `docker-compose -f ./docker-compose.yml up`
+
+ Here we expose Zookeeper at port 2181, Kafka at 9092 with an auto-created
Kafka topic “test”. All Hadoop-related configs are stated in the .env file.
+
+* You should see all services running. Now we can push some events into the
Kafka topic. Open a terminal from [docker
desktop](https://docs.docker.com/desktop/dashboard/) dashboard or [docker
exec](https://docs.docker.com/engine/reference/commandline/exec/) to interact
with Kafka. Inside the Kafka container terminal:
+
+ `cd /opt/kafka`
+
+ `./bin/kafka-console-producer.sh --broker-list kafka:9092 --topic test`
+
+ You can type messages for the topic “test”, and press ctrl+c to exit.
+
+* Put the
[kafka-hdfs.pull](https://github.com/apache/incubator-gobblin/blob/master/gobblin-example/src/main/resources/kafka-hdfs.pull)
in ${LOCAL_JOB_DIR}. The Gobblin daemon will pick up this job and
write the result to HDFS.
+
+After the job finishes, open a terminal in the HDFS namenode container:
+
+`hadoop fs -ls /gobblintest/job-output/test/`
+
+You will see the result file in this HDFS folder. You can use this command to
verify the content in the text file:
+
+`hadoop fs -cat /gobblintest/job-output/test/<output_file.txt>`
+
+# Run Gobblin as a Service
+
+The goal of GaaS (Gobblin as a Service) is to enable self-service so that
different users can automatically provision and execute various supported
Gobblin applications limiting the need for development and operation teams to
be involved during the provisioning process. You can take a look at our [design
detail](https://cwiki.apache.org/confluence/display/GOBBLIN/Gobblin+as+a+Service).
+
+### Set working directory
+
+Similar to standalone working directory settings:
-Running the `gobblin-standalone` image requires taking the following steps:
+`export GAAS_JOB_DIR=<gaas_gobblin_directory>`
-* Download the images from the `gobblin/gobblin-standalone` repository
+`export LOCAL_DATAPACK_DIR=<local_directory_of_templateUris>`
-```
-docker pull gobblin/gobblin-standalone:ubuntu-gobblin-latest
-```
+### Start Gobblin as a Service
-* Run the `gobblin/gobblin-standalone:ubuntu-gobblin-latest` image in a Docker
container
+Run these commands to start the docker image:
-```
-docker run -v /home/gobblin/conf:/etc/opt/job-conf \
- -v /home/gobblin/work-dir:/home/gobblin/work-dir \
- -v /home/gobblin/logs:/var/log/gobblin \
- gobblin/gobblin-standalone:ubuntu-gobblin-latest
-```
+`docker pull gobblin/gobblin-service:alpine-gaas-latest`
-A data volume needs to be created for the job configuration directory
(contains all the job configuration files), the work directory (contains all
the job output data), and the logs directory (contains all the Gobblin
standalone logs).
+`docker run -p 6956:6956 -v GAAS_JOB_DIR:/tmp/gobblin-as-service/jobs -v
LOCAL_DATAPACK_DIR:/tmp/templateCatalog
gobblin/gobblin-service:alpine-gaas-latest`
-The `-v /home/gobblin/conf:/etc/opt/job-conf` option allows any new job / pull
files added to the `/home/gobblin/conf` directory on the host filesystem will
be seen by the Gobblin standalone service inside the container. So any job /
pull added to the `/home/gobblin/conf` directory on the local filesystem will
be run by the Gobblin standalone inside running inside the Docker container.
Note the container directory (`/etc/opt/job-conf`) should not be modified,
while the host directory (`/ [...]
+The GaaS will be started, and the service can now be accessed on
localhost:6956.
-The `-v /home/gobblin/work-dir:/home/gobblin/work-dir` option allows the
container to write data to the host filesystem, so that the data persists after
the container is shutdown. Once again, the container directory
(`/home/gobblin/work-dir`) should not be modified, while the host directory
(`/home/gobblin/work-dir`) can be any directory on the host filesystem.
+### Interact with GaaS
-The `-v /home/gobblin/logs:/var/log/gobblin` option allows the Gobblin
standalone logs to be written to the host filesystem, so that they can be read
on the host machine. This is useful for monitoring and debugging purposes. Once
again, the container directory (`/var/log/gobblin`) directory should not be
modified, while the container directory (`/home/gobblin/logs`) can be any
directory on the host filesystem.
+##### TODO: Add an end-to-end workflow example in GaaS.
# Future Work
-* Create `gobblin-dev` images that provide an development environment for
Gobblin contributors
-* Create `gobblin-kafka` images that provide an end-to-end service for writing
to Kafka and ingesting the Kafka data through Gobblin
-* Test and write a tutorial on using `gobblin-standalone` images to write to a
HDFS cluster
-* Create images based on [Linux Alpine](https://hub.docker.com/_/alpine/)
(lightweight Linux distro)
+* Complete `gobblin-service` docker guidance that serves as a quick-start for
GaaS users
+* Implement a simple converter and inject into the docker service. Create a
corresponding doc to guide users to implement their own logic without needing
to tangle with the Gobblin codebase
+* Finish the Github action to automate the docker build
diff --git a/gobblin-example/src/main/resources/kafka-hdfs.pull
b/gobblin-example/src/main/resources/kafka-hdfs.pull
new file mode 100644
index 0000000..8883ca5
--- /dev/null
+++ b/gobblin-example/src/main/resources/kafka-hdfs.pull
@@ -0,0 +1,50 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+job.name=GobblinKafkaHDFSQuickStart
+job.group=GobblinKafka
+job.description=Gobblin quick start job for Kafka
+job.lock.enabled=false
+
+kafka.brokers=kafka:9092
+
+source.class=org.apache.gobblin.source.extractor.extract.kafka.KafkaSimpleSource
+extract.namespace=org.apache.gobblin.extract.kafka
+
+writer.builder.class=org.apache.gobblin.writer.SimpleDataWriterBuilder
+writer.file.path.type=tablename
+writer.destination.type=HDFS
+writer.output.format=txt
+
+data.publisher.type=org.apache.gobblin.publisher.BaseDataPublisher
+
+mr.job.max.mappers=1
+
+metrics.reporting.file.enabled=true
+metrics.log.dir=/gobblin-kafka/metrics
+metrics.reporting.file.suffix=txt
+
+bootstrap.with.offset=earliest
+
+fs.uri=hdfs://namenode:9000
+writer.fs.uri=hdfs://namenode:9000
+state.store.fs.uri=hdfs://namenode:9000
+
+mr.job.root.dir=/gobblin-kafka/working
+state.store.dir=/gobblin-kafka/state-store
+task.data.root.dir=/jobs/kafkaetl/gobblin/gobblin-kafka/task-data
+data.publisher.final.dir=/gobblintest/job-output