Repository: falcon
Updated Branches:
  refs/heads/master 1416f5e70 -> f96690006
http://git-wip-us.apache.org/repos/asf/falcon/blob/f9669000/falcon-regression/merlin/src/test/resources/HiveDrSecureRecipe/hive-disaster-recovery-secure-workflow.xml
----------------------------------------------------------------------
diff --git a/falcon-regression/merlin/src/test/resources/HiveDrSecureRecipe/hive-disaster-recovery-secure-workflow.xml b/falcon-regression/merlin/src/test/resources/HiveDrSecureRecipe/hive-disaster-recovery-secure-workflow.xml
new file mode 100644
index 0000000..7362c2e
--- /dev/null
+++ b/falcon-regression/merlin/src/test/resources/HiveDrSecureRecipe/hive-disaster-recovery-secure-workflow.xml
@@ -0,0 +1,401 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements. See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership. The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+  -->
+<workflow-app xmlns='uri:oozie:workflow:0.3' name='falcon-dr-hive-workflow'>
+    <credentials>
+        <credential name='hive_src_credentials' type='hcat'>
+            <property>
+                <name>hcat.metastore.uri</name>
+                <value>${sourceMetastoreUri}</value>
+            </property>
+            <property>
+                <name>hcat.metastore.principal</name>
+                <value>${sourceHiveMetastoreKerberosPrincipal}</value>
+            </property>
+        </credential>
+        <credential name='hive_tgt_credentials' type='hcat'>
+            <property>
+                <name>hcat.metastore.uri</name>
+                <value>${targetMetastoreUri}</value>
+            </property>
+            <property>
+                <name>hcat.metastore.principal</name>
+                <value>${targetHiveMetastoreKerberosPrincipal}</value>
+            </property>
+        </credential>
+        <credential name="hive2_src_credentials" type="hive2">
+            <property>
+                <name>hive2.server.principal</name>
+                <value>${sourceHive2KerberosPrincipal}</value>
+            </property>
+            <property>
+                <name>hive2.jdbc.url</name>
+                <value>jdbc:${sourceHiveServer2Uri}/${sourceDatabase}</value>
+            </property>
+        </credential>
+        <credential name="hive2_tgt_credentials" type="hive2">
+            <property>
+                <name>hive2.server.principal</name>
+                <value>${targetHive2KerberosPrincipal}</value>
+            </property>
+            <property>
+                <name>hive2.jdbc.url</name>
+                <value>jdbc:${targetHiveServer2Uri}/${sourceDatabase}</value>
+            </property>
+        </credential>
+    </credentials>
+    <start to='last-event'/>
+    <action name="last-event" cred="hive_tgt_credentials">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <configuration>
+                <property> <!-- hadoop 2 parameter -->
+                    <name>oozie.launcher.mapreduce.job.user.classpath.first</name>
+                    <value>true</value>
+                </property>
+                <property>
+                    <name>mapred.job.queue.name</name>
+                    <value>${queueName}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapred.job.priority</name>
+                    <value>${jobPriority}</value>
+                </property>
+                <property>
+                    <name>oozie.use.system.libpath</name>
+                    <value>true</value>
+                </property>
+                <property>
+                    <name>oozie.action.sharelib.for.java</name>
+                    <value>distcp,hive,hive2,hcatalog</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.job.hdfs-servers</name>
+                    <value>${sourceNN},${targetNN}</value>
+                </property>
+                <property>
+                    <name>mapreduce.job.hdfs-servers</name>
+                    <value>${sourceNN},${targetNN}</value>
+                </property>
+            </configuration>
+            <main-class>org.apache.falcon.hive.HiveDRTool</main-class>
+            <arg>-Dmapred.job.queue.name=${queueName}</arg>
+            <arg>-Dmapred.job.priority=${jobPriority}</arg>
+            <arg>-falconLibPath</arg>
+            <arg>${wf:conf("falcon.libpath")}</arg>
+            <arg>-sourceCluster</arg>
+            <arg>${sourceCluster}</arg>
+            <arg>-sourceMetastoreUri</arg>
+            <arg>${sourceMetastoreUri}</arg>
+            <arg>-sourceHiveServer2Uri</arg>
+            <arg>${sourceHiveServer2Uri}</arg>
+            <arg>-sourceDatabase</arg>
+            <arg>${sourceDatabase}</arg>
+            <arg>-sourceTable</arg>
+            <arg>${sourceTable}</arg>
+            <arg>-sourceStagingPath</arg>
+            <arg>${sourceStagingPath}</arg>
+            <arg>-sourceNN</arg>
+            <arg>${sourceNN}</arg>
+            <arg>-sourceNNKerberosPrincipal</arg>
+            <arg>${sourceNNKerberosPrincipal}</arg>
+            <arg>-sourceHiveMetastoreKerberosPrincipal</arg>
+            <arg>${sourceHiveMetastoreKerberosPrincipal}</arg>
+            <arg>-sourceHive2KerberosPrincipal</arg>
+            <arg>${sourceHive2KerberosPrincipal}</arg>
+            <arg>-targetCluster</arg>
+            <arg>${targetCluster}</arg>
+            <arg>-targetMetastoreUri</arg>
+            <arg>${targetMetastoreUri}</arg>
+            <arg>-targetHiveServer2Uri</arg>
+            <arg>${targetHiveServer2Uri}</arg>
+            <arg>-targetStagingPath</arg>
+            <arg>${targetStagingPath}</arg>
+            <arg>-targetNN</arg>
+            <arg>${targetNN}</arg>
+            <arg>-targetNNKerberosPrincipal</arg>
+            <arg>${targetNNKerberosPrincipal}</arg>
+            <arg>-targetHiveMetastoreKerberosPrincipal</arg>
+            <arg>${targetHiveMetastoreKerberosPrincipal}</arg>
+            <arg>-targetHive2KerberosPrincipal</arg>
+            <arg>${targetHive2KerberosPrincipal}</arg>
+            <arg>-maxEvents</arg>
+            <arg>${maxEvents}</arg>
+            <arg>-clusterForJobRun</arg>
+            <arg>${clusterForJobRun}</arg>
+            <arg>-clusterForJobRunWriteEP</arg>
+            <arg>${clusterForJobRunWriteEP}</arg>
+            <arg>-clusterForJobNNKerberosPrincipal</arg>
+            <arg>${clusterForJobNNKerberosPrincipal}</arg>
+            <arg>-drJobName</arg>
+            <arg>${drJobName}-${nominalTime}</arg>
+            <arg>-executionStage</arg>
+            <arg>lastevents</arg>
+        </java>
+        <ok to="export-dr-replication"/>
+        <error to="failure"/>
+    </action>
+    <!-- Export Replication action -->
+    <action name="export-dr-replication" cred="hive_src_credentials,hive2_src_credentials">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <configuration>
+                <property> <!-- hadoop 2 parameter -->
+                    <name>oozie.launcher.mapreduce.job.user.classpath.first</name>
+                    <value>true</value>
+                </property>
+                <property>
+                    <name>mapred.job.queue.name</name>
+                    <value>${queueName}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapred.job.priority</name>
+                    <value>${jobPriority}</value>
+                </property>
+                <property>
+                    <name>oozie.use.system.libpath</name>
+                    <value>true</value>
+                </property>
+                <property>
+                    <name>oozie.action.sharelib.for.java</name>
+                    <value>distcp,hive,hive2,hcatalog</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.job.hdfs-servers</name>
+                    <value>${sourceNN},${targetNN}</value>
+                </property>
+                <property>
+                    <name>mapreduce.job.hdfs-servers</name>
+                    <value>${sourceNN},${targetNN}</value>
+                </property>
+            </configuration>
+            <main-class>org.apache.falcon.hive.HiveDRTool</main-class>
+            <arg>-Dmapred.job.queue.name=${queueName}</arg>
+            <arg>-Dmapred.job.priority=${jobPriority}</arg>
+            <arg>-falconLibPath</arg>
+            <arg>${wf:conf("falcon.libpath")}</arg>
+            <arg>-replicationMaxMaps</arg>
+            <arg>${replicationMaxMaps}</arg>
+            <arg>-distcpMaxMaps</arg>
+            <arg>${distcpMaxMaps}</arg>
+            <arg>-sourceCluster</arg>
+            <arg>${sourceCluster}</arg>
+            <arg>-sourceMetastoreUri</arg>
+            <arg>${sourceMetastoreUri}</arg>
+            <arg>-sourceHiveServer2Uri</arg>
+            <arg>${sourceHiveServer2Uri}</arg>
+            <arg>-sourceDatabase</arg>
+            <arg>${sourceDatabase}</arg>
+            <arg>-sourceTable</arg>
+            <arg>${sourceTable}</arg>
+            <arg>-sourceStagingPath</arg>
+            <arg>${sourceStagingPath}</arg>
+            <arg>-sourceNN</arg>
+            <arg>${sourceNN}</arg>
+            <arg>-sourceNNKerberosPrincipal</arg>
+            <arg>${sourceNNKerberosPrincipal}</arg>
+            <arg>-sourceHiveMetastoreKerberosPrincipal</arg>
+            <arg>${sourceHiveMetastoreKerberosPrincipal}</arg>
+            <arg>-sourceHive2KerberosPrincipal</arg>
+            <arg>${sourceHive2KerberosPrincipal}</arg>
+            <arg>-targetCluster</arg>
+            <arg>${targetCluster}</arg>
+            <arg>-targetMetastoreUri</arg>
+            <arg>${targetMetastoreUri}</arg>
+            <arg>-targetHiveServer2Uri</arg>
+            <arg>${targetHiveServer2Uri}</arg>
+            <arg>-targetStagingPath</arg>
+            <arg>${targetStagingPath}</arg>
+            <arg>-targetNN</arg>
+            <arg>${targetNN}</arg>
+            <arg>-targetNNKerberosPrincipal</arg>
+            <arg>${targetNNKerberosPrincipal}</arg>
+            <arg>-targetHiveMetastoreKerberosPrincipal</arg>
+            <arg>${targetHiveMetastoreKerberosPrincipal}</arg>
+            <arg>-targetHive2KerberosPrincipal</arg>
+            <arg>${targetHive2KerberosPrincipal}</arg>
+            <arg>-maxEvents</arg>
+            <arg>${maxEvents}</arg>
+            <arg>-distcpMapBandwidth</arg>
+            <arg>${distcpMapBandwidth}</arg>
+            <arg>-clusterForJobRun</arg>
+            <arg>${clusterForJobRun}</arg>
+            <arg>-clusterForJobRunWriteEP</arg>
+            <arg>${clusterForJobRunWriteEP}</arg>
+            <arg>-clusterForJobNNKerberosPrincipal</arg>
+            <arg>${clusterForJobNNKerberosPrincipal}</arg>
+            <arg>-drJobName</arg>
+            <arg>${drJobName}-${nominalTime}</arg>
+            <arg>-executionStage</arg>
+            <arg>export</arg>
+        </java>
+        <ok to="import-dr-replication"/>
+        <error to="failure"/>
+    </action>
+    <!-- Import Replication action -->
+    <action name="import-dr-replication" cred="hive_tgt_credentials,hive2_tgt_credentials">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <configuration>
+                <property> <!-- hadoop 2 parameter -->
+                    <name>oozie.launcher.mapreduce.job.user.classpath.first</name>
+                    <value>true</value>
+                </property>
+                <property>
+                    <name>mapred.job.queue.name</name>
+                    <value>${queueName}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapred.job.priority</name>
+                    <value>${jobPriority}</value>
+                </property>
+                <property>
+                    <name>oozie.use.system.libpath</name>
+                    <value>true</value>
+                </property>
+                <property>
+                    <name>oozie.action.sharelib.for.java</name>
+                    <value>distcp,hive,hive2,hcatalog</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.job.hdfs-servers</name>
+                    <value>${sourceNN},${targetNN}</value>
+                </property>
+                <property>
+                    <name>mapreduce.job.hdfs-servers</name>
+                    <value>${sourceNN},${targetNN}</value>
+                </property>
+            </configuration>
+            <main-class>org.apache.falcon.hive.HiveDRTool</main-class>
+            <arg>-Dmapred.job.queue.name=${queueName}</arg>
+            <arg>-Dmapred.job.priority=${jobPriority}</arg>
+            <arg>-falconLibPath</arg>
+            <arg>${wf:conf("falcon.libpath")}</arg>
+            <arg>-replicationMaxMaps</arg>
+            <arg>${replicationMaxMaps}</arg>
+            <arg>-distcpMaxMaps</arg>
+            <arg>${distcpMaxMaps}</arg>
+            <arg>-sourceCluster</arg>
+            <arg>${sourceCluster}</arg>
+            <arg>-sourceMetastoreUri</arg>
+            <arg>${sourceMetastoreUri}</arg>
+            <arg>-sourceHiveServer2Uri</arg>
+            <arg>${sourceHiveServer2Uri}</arg>
+            <arg>-sourceDatabase</arg>
+            <arg>${sourceDatabase}</arg>
+            <arg>-sourceTable</arg>
+            <arg>${sourceTable}</arg>
+            <arg>-sourceStagingPath</arg>
+            <arg>${sourceStagingPath}</arg>
+            <arg>-sourceNN</arg>
+            <arg>${sourceNN}</arg>
+            <arg>-sourceNNKerberosPrincipal</arg>
+            <arg>${sourceNNKerberosPrincipal}</arg>
+            <arg>-sourceHiveMetastoreKerberosPrincipal</arg>
+            <arg>${sourceHiveMetastoreKerberosPrincipal}</arg>
+            <arg>-sourceHive2KerberosPrincipal</arg>
+            <arg>${sourceHive2KerberosPrincipal}</arg>
+            <arg>-targetCluster</arg>
+            <arg>${targetCluster}</arg>
+            <arg>-targetMetastoreUri</arg>
+            <arg>${targetMetastoreUri}</arg>
+            <arg>-targetHiveServer2Uri</arg>
+            <arg>${targetHiveServer2Uri}</arg>
+            <arg>-targetStagingPath</arg>
+            <arg>${targetStagingPath}</arg>
+            <arg>-targetNN</arg>
+            <arg>${targetNN}</arg>
+            <arg>-targetNNKerberosPrincipal</arg>
+            <arg>${targetNNKerberosPrincipal}</arg>
+            <arg>-targetHiveMetastoreKerberosPrincipal</arg>
+            <arg>${targetHiveMetastoreKerberosPrincipal}</arg>
+            <arg>-targetHive2KerberosPrincipal</arg>
+            <arg>${targetHive2KerberosPrincipal}</arg>
+            <arg>-maxEvents</arg>
+            <arg>${maxEvents}</arg>
+            <arg>-distcpMapBandwidth</arg>
+            <arg>${distcpMapBandwidth}</arg>
+            <arg>-clusterForJobRun</arg>
+            <arg>${clusterForJobRun}</arg>
+            <arg>-clusterForJobRunWriteEP</arg>
+            <arg>${clusterForJobRunWriteEP}</arg>
+            <arg>-clusterForJobNNKerberosPrincipal</arg>
+            <arg>${clusterForJobNNKerberosPrincipal}</arg>
+            <arg>-drJobName</arg>
+            <arg>${drJobName}-${nominalTime}</arg>
+            <arg>-executionStage</arg>
+            <arg>import</arg>
+        </java>
+        <ok to="success"/>
+        <error to="failure"/>
+    </action>
+    <decision name="success">
+        <switch>
+            <case to="successAlert">
+                ${drNotificationReceivers ne 'NA'}
+            </case>
+            <default to="end"/>
+        </switch>
+    </decision>
+    <decision name="failure">
+        <switch>
+            <case to="failureAlert">
+                ${drNotificationReceivers ne 'NA'}
+            </case>
+            <default to="fail"/>
+        </switch>
+    </decision>
+    <action name="successAlert">
+        <email xmlns="uri:oozie:email-action:0.2">
+            <to>${drNotificationReceivers}</to>
+            <subject>INFO: Hive DR workflow ${drJobName} completed successfully</subject>
+            <body>
+                The Hive DR workflow ${wf:id()} is successful.
+                Source = ${sourceCluster}
+                Target = ${targetCluster}
+                DB Name = ${sourceDatabase}
+                Table Name = ${sourceTable}
+            </body>
+        </email>
+        <ok to="end"/>
+        <error to="end"/>
+    </action>
+    <action name="failureAlert">
+        <email xmlns="uri:oozie:email-action:0.2">
+            <to>${drNotificationReceivers}</to>
+            <subject>ERROR: Hive DR workflow ${drJobName} failed</subject>
+            <body>
+                The Hive DR workflow ${wf:id()} had issues and was killed.
+                The error message is: ${wf:errorMessage(wf:lastErrorNode())}
+                Source = ${sourceCluster}
+                Target = ${targetCluster}
+                DB Name = ${sourceDatabase}
+                Table Name = ${sourceTable}
+            </body>
+        </email>
+        <ok to="end"/>
+        <error to="fail"/>
+    </action>
+    <kill name="fail">
+        <message>
+            Workflow action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+        </message>
+    </kill>
+    <end name="end"/>
+</workflow-app>

http://git-wip-us.apache.org/repos/asf/falcon/blob/f9669000/falcon-regression/merlin/src/test/resources/HiveDrSecureRecipe/hive-disaster-recovery-secure.properties
----------------------------------------------------------------------
diff --git a/falcon-regression/merlin/src/test/resources/HiveDrSecureRecipe/hive-disaster-recovery-secure.properties b/falcon-regression/merlin/src/test/resources/HiveDrSecureRecipe/hive-disaster-recovery-secure.properties
new file mode 100644
index 0000000..ff2611f
--- /dev/null
+++ b/falcon-regression/merlin/src/test/resources/HiveDrSecureRecipe/hive-disaster-recovery-secure.properties
@@ -0,0 +1,104 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+##### NOTE: This is a TEMPLATE file which can be copied and edited
+
+##### Recipe properties
+falcon.recipe.name=hive-disaster-recovery
+
+
+##### Workflow properties
+falcon.recipe.workflow.name=hive-dr-workflow
+# Provide the workflow absolute path. This can be an HDFS or local FS path. If the workflow is on the local FS, it will be copied to HDFS.
+falcon.recipe.workflow.path=/recipes/hive-replication/hive-disaster-recovery-secure-workflow.xml
+
+##### Cluster properties
+
+# Change the cluster name where the replication job should run here
+falcon.recipe.cluster.name=backupCluster
+# Change the cluster HDFS write endpoint here. This is mandatory.
+falcon.recipe.cluster.hdfs.writeEndPoint=hdfs://localhost:8020
+# Change the cluster validity start time here
+falcon.recipe.cluster.validity.start=2014-10-01T00:00Z
+# Change the cluster validity end time here
+falcon.recipe.cluster.validity.end=2016-12-30T00:00Z
+# Change the cluster NameNode Kerberos principal. This is mandatory on secure clusters.
+falcon.recipe.nn.principal=nn/[email protected]
+
+##### Scheduling properties
+
+# Change the process frequency here. Valid frequency types are minutes, hours, days, months
+falcon.recipe.process.frequency=minutes(60)
+
+##### Retry policy properties
+
+falcon.recipe.retry.policy=periodic
+falcon.recipe.retry.delay=minutes(30)
+falcon.recipe.retry.attempts=3
+
+##### Tag properties - An optional comma-separated list of key=value tag pairs
+##### Uncomment to add tags
+#falcon.recipe.tags=owner=landing,pipeline=adtech
+
+##### ACL properties - Uncomment and change ACL if authorization is enabled
+
+#falcon.recipe.acl.owner=testuser
+#falcon.recipe.acl.group=group
+#falcon.recipe.acl.permission=0x755
+
+##### Custom Job properties
+
+##### Source Cluster DR properties
+sourceCluster=primaryCluster
+sourceMetastoreUri=thrift://localhost:9083
+sourceHiveServer2Uri=hive2://localhost:10000
+# For DB-level replication, to replicate multiple databases specify a comma-separated list of databases
+sourceDatabase=default
+# For DB-level replication specify * for sourceTable.
+# For table-level replication, to replicate multiple tables specify a comma-separated list of tables
+sourceTable=testtable_dr
+sourceStagingPath=/apps/hive/tools/dr
+sourceNN=hdfs://localhost:8020
+# Specify the Kerberos principals required to access the source NameNode and Hive servers; optional on a non-secure cluster.
+sourceNNKerberosPrincipal=nn/[email protected]
+sourceHiveMetastoreKerberosPrincipal=hive/[email protected]
+sourceHive2KerberosPrincipal=hive/[email protected]
+
+##### Target Cluster DR properties
+targetCluster=backupCluster
+targetMetastoreUri=thrift://localhost:9083
+targetHiveServer2Uri=hive2://localhost:10000
+targetStagingPath=/apps/hive/tools/dr
+targetNN=hdfs://localhost:8020
+# Specify the Kerberos principals required to access the target NameNode and Hive servers; optional on a non-secure cluster.
+targetNNKerberosPrincipal=nn/[email protected]
+targetHiveMetastoreKerberosPrincipal=hive/[email protected]
+targetHive2KerberosPrincipal=hive/[email protected]
+
+# Caps the number of events processed each time the job runs. Set it to a value that suits your bandwidth limit.
+# Setting it to -1 processes all pending events but can hog the bandwidth. Use it judiciously!
+maxEvents=-1
+# Change it to specify the maximum number of mappers for replication
+replicationMaxMaps=5
+# Change it to specify the maximum number of mappers for DistCp
+distcpMaxMaps=1
+# Change it to specify the bandwidth in MB for each mapper in DistCp
+distcpMapBandwidth=100
+
+##### Email on failure
+drNotificationReceivers=NA

http://git-wip-us.apache.org/repos/asf/falcon/blob/f9669000/falcon-regression/pom.xml
----------------------------------------------------------------------
diff --git a/falcon-regression/pom.xml b/falcon-regression/pom.xml
index 1490313..7833b69 100644
--- a/falcon-regression/pom.xml
+++ b/falcon-regression/pom.xml
@@ -39,8 +39,10 @@
     </modules>
 
     <properties>
-        <oozie.version>4.0.0.2.1.7.0-784</oozie.version>
-        <hive.version>0.13.1</hive.version>
+        <oozie.version>4.1.0</oozie.version>
+        <hive.version>1.2.1</hive.version>
+        <hcatalog.version>1.2.1</hcatalog.version>
+        <hadoop.version>2.7.1</hadoop.version>
         <testng.exclude.pattern>nothing</testng.exclude.pattern>
         <testng.include.pattern>**/Test*.java,**/*Test.java,**/*TestCase.java</testng.include.pattern>
     </properties>
@@ -48,6 +50,9 @@
     <profiles>
         <profile>
             <id>hadoop-2</id>
+            <activation>
+                <activeByDefault>true</activeByDefault>
+            </activation>
             <dependencyManagement>
                 <dependencies>
                     <dependency>
@@ -84,6 +89,12 @@
 
                     <dependency>
                         <groupId>org.apache.hadoop</groupId>
+                        <artifactId>hadoop-distcp</artifactId>
+                        <version>${hadoop.version}</version>
+                    </dependency>
+
+                    <dependency>
+                        <groupId>org.apache.hadoop</groupId>
                         <artifactId>hadoop-yarn-server-resourcemanager</artifactId>
                         <version>${hadoop.version}</version>
                     </dependency>
@@ -95,6 +106,12 @@
                     </dependency>
 
                     <dependency>
+                        <groupId>org.apache.hadoop</groupId>
+                        <artifactId>hadoop-azure</artifactId>
+                        <version>${hadoop.version}</version>
+                    </dependency>
+
+                    <dependency>
                         <groupId>org.apache.hive</groupId>
                         <artifactId>hive-common</artifactId>
                         <version>${hive.version}</version>
@@ -111,6 +128,19 @@
                         <artifactId>hive-hcatalog-core</artifactId>
                         <version>${hive.version}</version>
                     </dependency>
+
+                    <dependency>
+                        <groupId>org.apache.hive</groupId>
+                        <artifactId>hive-jdbc</artifactId>
+                        <version>${hive.version}</version>
+                    </dependency>
+
+                    <dependency>
+                        <groupId>org.apache.hive</groupId>
+                        <artifactId>hive-metastore</artifactId>
+                        <version>${hive.version}</version>
+                    </dependency>
+
                 </dependencies>
             </dependencyManagement>
         </profile>
@@ -231,6 +261,13 @@
             <artifactId>guava</artifactId>
             <version>18.0</version>
         </dependency>
+
+        <dependency>
+            <groupId>commons-io</groupId>
+            <artifactId>commons-io</artifactId>
+            <version>2.4</version>
+        </dependency>
+
     </dependencies>
 
 </dependencyManagement>
@@ -328,10 +365,15 @@
                         <!-- Hack to get the dir in CP through CLI and idea -->
                         <additionalClasspathElement>${hadoop.conf.dir}</additionalClasspathElement>
                         <additionalClasspathElement>${hive.conf.dir}</additionalClasspathElement>
+                        <additionalClasspathElement>${falcon.conf.dir}</additionalClasspathElement>
                         <additionalClasspathElement>merlin/src/test/resources/hadoop-conf
                         </additionalClasspathElement>
                         <additionalClasspathElement>src/test/resources/hadoop-conf
                        </additionalClasspathElement>
+                        <additionalClasspathElement>merlin/src/test/resources/falcon-conf
+                        </additionalClasspathElement>
+                        <additionalClasspathElement>src/test/resources/falcon-conf
+                        </additionalClasspathElement>
                     </additionalClasspathElements>
                     <properties>
                         <property>
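For readers tracing how the two new HiveDrSecureRecipe files fit together: the ${...} variables in the workflow are normally bound by Falcon's recipe tooling from the properties template above. Purely as an illustrative sketch (not part of this change), a minimal Oozie job.properties that binds the same names directly would look roughly like the lines below; the property names come from the workflow itself, the values echo the template defaults, and jobTracker, nameNode, nominalTime and oozie.wf.application.path are placeholder assumptions for a local test cluster rather than anything defined by this commit.

    # Hypothetical job.properties -- illustrative only, not included in this commit
    nameNode=hdfs://localhost:8020
    jobTracker=localhost:8032
    queueName=default
    jobPriority=NORMAL
    drJobName=hive-dr-test
    nominalTime=2016-01-01-00-00
    maxEvents=-1
    replicationMaxMaps=5
    distcpMaxMaps=1
    distcpMapBandwidth=100
    clusterForJobRun=backupCluster
    clusterForJobRunWriteEP=hdfs://localhost:8020
    # Reuse the source*/target* URIs, staging paths and Kerberos principals from
    # hive-disaster-recovery-secure.properties above for the remaining variables.
    oozie.wf.application.path=${nameNode}/recipes/hive-replication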
