Repository: incubator-griffin-site Updated Branches: refs/heads/asf-site 7af1690fb -> 47c77c9b6
Updated asf-site site from master (4d98ade756427a2df8b9b1695c9bae31c0974780) Project: http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/commit/47c77c9b Tree: http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/tree/47c77c9b Diff: http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/diff/47c77c9b Branch: refs/heads/asf-site Commit: 47c77c9b676490dbb0345b5f9f9e644e016ca3f9 Parents: 7af1690 Author: William Guo <gu...@apache.org> Authored: Thu Sep 13 22:35:25 2018 +0800 Committer: William Guo <gu...@apache.org> Committed: Thu Sep 13 22:35:25 2018 +0800 ---------------------------------------------------------------------- data/create-table.hql | 27 ------------------- data/gen_delta_src.sh | 12 --------- data/gen_demo_data.sh | 14 ---------- data/gen_hive_data.sh | 54 -------------------------------------- data/insert-data.hql.template | 2 -- docs/quickstart.html | 10 ++++--- 6 files changed, 7 insertions(+), 112 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/47c77c9b/data/create-table.hql ---------------------------------------------------------------------- diff --git a/data/create-table.hql b/data/create-table.hql deleted file mode 100644 index e117cd6..0000000 --- a/data/create-table.hql +++ /dev/null @@ -1,27 +0,0 @@ ---replace data location with your own path - -CREATE EXTERNAL TABLE `demo_src`( - `id` bigint, - `age` int, - `desc` string) -PARTITIONED BY ( - `dt` string, - `hour` string) -ROW FORMAT DELIMITED - FIELDS TERMINATED BY '|' -LOCATION - 'hdfs:///griffin/data/batch/demo_src'; - ---replace data location with your own path - -CREATE EXTERNAL TABLE `demo_tgt`( - `id` bigint, - `age` int, - `desc` string) -PARTITIONED BY ( - `dt` string, - `hour` string) -ROW FORMAT DELIMITED - FIELDS TERMINATED BY '|' -LOCATION - 
'hdfs:///griffin/data/batch/demo_tgt'; http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/47c77c9b/data/gen_delta_src.sh ---------------------------------------------------------------------- diff --git a/data/gen_delta_src.sh b/data/gen_delta_src.sh deleted file mode 100644 index 29fc96b..0000000 --- a/data/gen_delta_src.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env bash - -file=delta_src -id=124 - -rm ${file} - -for i in {1..1000} -do - idx=`shuf -i1-2000 -n1` - echo "${id}|${idx}|${idx}" >> ${file} -done http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/47c77c9b/data/gen_demo_data.sh ---------------------------------------------------------------------- diff --git a/data/gen_demo_data.sh b/data/gen_demo_data.sh deleted file mode 100644 index 55a975c..0000000 --- a/data/gen_demo_data.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env bash - -./gen_delta_src.sh - -src=demo_src -tgt=demo_tgt - -rm ${src} -cat demo_basic >> ${src} -cat delta_src >> ${src} - -rm ${tgt} -cat demo_basic >> ${tgt} -cat delta_tgt >> ${tgt} http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/47c77c9b/data/gen_hive_data.sh ---------------------------------------------------------------------- diff --git a/data/gen_hive_data.sh b/data/gen_hive_data.sh deleted file mode 100644 index 5d7816d..0000000 --- a/data/gen_hive_data.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env bash - -#create table -hive -f create-table.hql -echo "create table done" - -#current hour -./gen_demo_data.sh -cur_date=`date +%Y%m%d%H` -dt=${cur_date:0:8} -hour=${cur_date:8:2} -partition_date="dt='$dt',hour='$hour'" -sed s/PARTITION_DATE/$partition_date/ ./insert-data.hql.template > insert-data.hql -hive -f insert-data.hql -src_done_path=/griffin/data/batch/demo_src/dt=${dt}/hour=${hour}/_DONE -tgt_done_path=/griffin/data/batch/demo_tgt/dt=${dt}/hour=${hour}/_DONE -hadoop fs -touchz ${src_done_path} -hadoop fs -touchz ${tgt_done_path} -echo "insert data 
[$partition_date] done" - -#last hour -./gen_demo_data.sh -cur_date=`date -d '1 hour ago' +%Y%m%d%H` -dt=${cur_date:0:8} -hour=${cur_date:8:2} -partition_date="dt='$dt',hour='$hour'" -sed s/PARTITION_DATE/$partition_date/ ./insert-data.hql.template > insert-data.hql -hive -f insert-data.hql -src_done_path=/griffin/data/batch/demo_src/dt=${dt}/hour=${hour}/_DONE -tgt_done_path=/griffin/data/batch/demo_tgt/dt=${dt}/hour=${hour}/_DONE -hadoop fs -touchz ${src_done_path} -hadoop fs -touchz ${tgt_done_path} -echo "insert data [$partition_date] done" - -#next hours -set +e -while true -do - ./gen_demo_data.sh - cur_date=`date +%Y%m%d%H` - next_date=`date -d "+1hour" '+%Y%m%d%H'` - dt=${next_date:0:8} - hour=${next_date:8:2} - partition_date="dt='$dt',hour='$hour'" - sed s/PARTITION_DATE/$partition_date/ ./insert-data.hql.template > insert-data.hql - hive -f insert-data.hql - src_done_path=/griffin/data/batch/demo_src/dt=${dt}/hour=${hour}/_DONE - tgt_done_path=/griffin/data/batch/demo_tgt/dt=${dt}/hour=${hour}/_DONE - hadoop fs -touchz ${src_done_path} - hadoop fs -touchz ${tgt_done_path} - echo "insert data [$partition_date] done" - sleep 3600 -done -set -e http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/47c77c9b/data/insert-data.hql.template ---------------------------------------------------------------------- diff --git a/data/insert-data.hql.template b/data/insert-data.hql.template deleted file mode 100644 index 4e4039a..0000000 --- a/data/insert-data.hql.template +++ /dev/null @@ -1,2 +0,0 @@ -LOAD DATA LOCAL INPATH 'demo_src' INTO TABLE demo_src PARTITION (PARTITION_DATE); -LOAD DATA LOCAL INPATH 'demo_tgt' INTO TABLE demo_tgt PARTITION (PARTITION_DATE); http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/47c77c9b/docs/quickstart.html ---------------------------------------------------------------------- diff --git a/docs/quickstart.html b/docs/quickstart.html index 1fdc75b..9bfd59f 100644 --- a/docs/quickstart.html +++ 
b/docs/quickstart.html @@ -129,14 +129,18 @@ under the License. <h2 id="user-story">User Story</h2> <p>Say we have two hive tables(demo_src, demo_tgt), we need to know what is the data quality for target table, based on source table.</p> -<p>For simplicity, suppose both two table have the same schema as this:</p> +<p>For simplicity, suppose both two tables have the same schema as this:</p> <div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>id bigint age int desc string dt string hour string </code></pre></div></div> -<p>dt and hour are partitions, as every date we have one big partition dt(like 20180912), for every date we have 24 hour partitions(like 01,02, …).</p> +<p>dt and hour are partitions,</p> + +<p>as every date we have one partition dt(like 20180912),</p> + +<p>for every date we have 24 hour partitions(like 01,02, …).</p> <h2 id="environment-preparation">Environment Preparation</h2> <p>You need to prepare the environment for Apache Griffin measure module, including the following software:</p> @@ -168,7 +172,7 @@ cd griffin-0.3.0-incubating-source-release <h2 id="data-preparation">Data Preparation</h2> -<p>For our quick start, We will generate two Hive tables demo_src and demo_tgt.</p> +<p>For our quick start, We will generate two hive tables demo_src and demo_tgt.</p> <div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>--create hive tables here. hql script --Note: replace hdfs location with your own path CREATE EXTERNAL TABLE `demo_src`(