Hi,
We are running Oozie 4.0.0 with a Derby database in a Cloudera 5.1.3
cluster.
The workflow has ~400 actions; a sample workflow is attached.
The workflow had been running successfully for many days.
At some point I changed the workflow to run only a small subset of the
originally 400 actions.
From that point on, Oozie's execution no longer corresponded to the workflow.
It ran the actions specified
in the small subset, but it also ran many other actions from the original
set of 400. It did not run the whole
set of 400, though.
At first I thought that there was some problem with the new script or
configuration. After many
unsuccessful attempts to solve the problem while changing the script or
configuration,
I discovered that the only way to return to correct execution was to
completely erase the Derby database
and recreate the database again. After that Oozie started to obey the
workflow specification again.
Also, after recreating the database, Oozie's performance improved.
The issue has not reproduced since, but we worry that it will hit us again.
I was wondering whether you have ever encountered something similar. Could you please
point us in the right direction
to avoid this issue in the future?
Appreciate your help.
Regards,
Alex
<workflow-app name="tablesWorkflow" xmlns="uri:oozie:workflow:0.5">
    <!--
        Imports table_1, table_2 and table_3 one after another with Sqoop.
        Every action moves on to the next one on success and to the "kill"
        node on error; the last action transitions to "end".

        The jobTracker, nameNode, db and part properties are referenced but
        not defined in this workflow; they must be supplied by the submitting
        job or by a parent workflow.

        NOTE(review): all three imports use the same target-dir value. A Sqoop
        import into a directory that already exists is expected to fail unless
        the append or delete-target-dir option is given, so the real paths
        presumably differ per table. Please confirm.

        NOTE(review): the database user and password are passed as plain
        arguments and are readable by anyone who can read this definition.
        Consider the Sqoop password-file option instead.
    -->
    <start to="table_1"/>

    <!-- Import of table_1; continues with table_2 on success. -->
    <action name="table_1">
        <sqoop xmlns="uri:oozie:sqoop-action:0.4">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>import</arg>
            <arg>--connect</arg>
            <arg>jdbc:mysql://${db}/part${part}?useOldUTF8Behavior=true</arg>
            <arg>--username</arg>
            <arg>usr</arg>
            <arg>--password</arg>
            <arg>pwd</arg>
            <arg>--table</arg>
            <arg>table_1</arg>
            <arg>--as-avrodatafile</arg>
            <arg>--target-dir</arg>
            <arg>/data/path/db_shard_id=${part}</arg>
            <arg>--compress</arg>
            <arg>--split-by</arg>
            <arg>site_id</arg>
            <arg>-m</arg>
            <arg>1</arg>
            <arg>--where</arg>
            <arg>TRUE /* NOKILL */</arg>
        </sqoop>
        <ok to="table_2"/>
        <error to="kill"/>
    </action>

    <!-- Import of table_2; continues with table_3 on success. -->
    <action name="table_2">
        <sqoop xmlns="uri:oozie:sqoop-action:0.4">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>import</arg>
            <arg>--connect</arg>
            <arg>jdbc:mysql://${db}/part${part}?useOldUTF8Behavior=true</arg>
            <arg>--username</arg>
            <arg>usr</arg>
            <arg>--password</arg>
            <arg>pwd</arg>
            <arg>--table</arg>
            <arg>table_2</arg>
            <arg>--as-avrodatafile</arg>
            <arg>--target-dir</arg>
            <arg>/data/path/db_shard_id=${part}</arg>
            <arg>--compress</arg>
            <arg>--split-by</arg>
            <arg>site_id</arg>
            <arg>-m</arg>
            <arg>1</arg>
            <arg>--where</arg>
            <arg>TRUE /* NOKILL */</arg>
        </sqoop>
        <ok to="table_3"/>
        <error to="kill"/>
    </action>

    <!-- Import of table_3; last action, finishes the workflow on success. -->
    <action name="table_3">
        <sqoop xmlns="uri:oozie:sqoop-action:0.4">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>import</arg>
            <arg>--connect</arg>
            <arg>jdbc:mysql://${db}/part${part}?useOldUTF8Behavior=true</arg>
            <arg>--username</arg>
            <arg>usr</arg>
            <arg>--password</arg>
            <arg>pwd</arg>
            <arg>--table</arg>
            <arg>table_3</arg>
            <arg>--as-avrodatafile</arg>
            <arg>--target-dir</arg>
            <arg>/data/path/db_shard_id=${part}</arg>
            <arg>--compress</arg>
            <arg>--split-by</arg>
            <arg>site_id</arg>
            <arg>-m</arg>
            <arg>1</arg>
            <arg>--where</arg>
            <arg>TRUE /* NOKILL */</arg>
        </sqoop>
        <ok to="end"/>
        <error to="kill"/>
    </action>

    <!--
        Terminal nodes. The kill message includes the error reported by the
        node that failed, so a failed run shows which import broke and why
        instead of a fixed, uninformative text.
    -->
    <kill name="kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <end name="end"/>
</workflow-app>
<workflow-app name="databasesWorkflow" xmlns="uri:oozie:workflow:0.5">
    <!--
        Top-level workflow of the sample:
          1. cleanHdfs deletes /data/alex. Both its ok and its error
             transition lead to the fork, so a failed delete does not stop
             the run.
          2. forkActions starts twenty parallel paths, site_db1_1 through
             site_db20_1.
          3. Each path is a chain of sub-workflow actions that run
             tables-workflow.xml with its own "part" and "db" values.
             Only the first three actions of the db1 chain are shown in this
             sample; the rest is elided.
          4. Every error transition shown goes straight to joinActions, which
             leads to "end".

        NOTE(review): the delete path puts hdfs:// in front of the nameNode
        property. If that property already carries the hdfs:// scheme, the
        resulting path is malformed, and the error transition of cleanHdfs
        would hide the failure. Please verify the property value.
    -->
    <start to="cleanHdfs"/>

    <!-- Best-effort cleanup before the imports start. -->
    <action name="cleanHdfs">
        <fs>
            <delete path="hdfs://${nameNode}/data/alex"/>
        </fs>
        <ok to="forkActions"/>
        <error to="forkActions"/>
    </action>

    <!-- One parallel path per database chain. -->
    <fork name="forkActions">
        <path start="site_db1_1"/>
        <path start="site_db2_1"/>
        <path start="site_db3_1"/>
        <path start="site_db4_1"/>
        <path start="site_db5_1"/>
        <path start="site_db6_1"/>
        <path start="site_db7_1"/>
        <path start="site_db8_1"/>
        <path start="site_db9_1"/>
        <path start="site_db10_1"/>
        <path start="site_db11_1"/>
        <path start="site_db12_1"/>
        <path start="site_db13_1"/>
        <path start="site_db14_1"/>
        <path start="site_db15_1"/>
        <path start="site_db16_1"/>
        <path start="site_db17_1"/>
        <path start="site_db18_1"/>
        <path start="site_db19_1"/>
        <path start="site_db20_1"/>
    </fork>

    <!-- db1 chain: part 1 -->
    <action name="site_db1_1">
        <sub-workflow>
            <app-path>${exampleDir}/tables-workflow.xml</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>part</name>
                    <value>1</value>
                </property>
                <property>
                    <name>db</name>
                    <value>ip1:port1</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="site_db1_2"/>
        <error to="joinActions"/>
    </action>

    <!-- db1 chain: part 2 -->
    <action name="site_db1_2">
        <sub-workflow>
            <app-path>${exampleDir}/tables-workflow.xml</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>part</name>
                    <value>2</value>
                </property>
                <property>
                    <name>db</name>
                    <value>ip2:port2</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="site_db1_3"/>
        <error to="joinActions"/>
    </action>

    <!-- db1 chain: part 3; its ok transition targets an action elided below. -->
    <action name="site_db1_3">
        <sub-workflow>
            <app-path>${exampleDir}/tables-workflow.xml</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>part</name>
                    <value>3</value>
                </property>
                <property>
                    <name>db</name>
                    <value>ip3:port3</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="site_db1_4"/>
        <error to="joinActions"/>
    </action>
    ..................
    ..................
    <!-- All paths meet here; the workflow then ends. -->
    <join name="joinActions" to="end"/>
    <end name="end"/>
</workflow-app>