Author: ddas
Date: Mon Jun 15 06:16:59 2009
New Revision: 784664
URL: http://svn.apache.org/viewvc?rev=784664&view=rev
Log:
Merge -r 784660:784661 from trunk onto 0.20 branch. Had to apply the testcase
part of the patch manually though since the patch was only for trunk. Fixes
HADOOP-5921.
Modified:
hadoop/core/branches/branch-0.20/ (props changed)
hadoop/core/branches/branch-0.20/CHANGES.txt (contents, props changed)
hadoop/core/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/JobTracker.java
hadoop/core/branches/branch-0.20/src/test/org/apache/hadoop/mapred/TestRecoveryManager.java
Propchange: hadoop/core/branches/branch-0.20/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Mon Jun 15 06:16:59 2009
@@ -1,2 +1,2 @@
/hadoop/core/branches/branch-0.19:713112
-/hadoop/core/trunk:727001,727117,727191,727212,727217,727228,727255,727869,728187,729052,729987,732385,732572,732613,732777,732838,732869,733887,734870,734916,736426,738328,738697,740077,740157,741703,741762,743745,743816,743892,744894,745180,746010,746206,746227,746233,746274,746338,746902-746903,746925,746944,746968,746970,747279,747289,747802,748084,748090,748783,749262,749318,749863,750533,752073,752609,752834,752836,752913,752932,753112-753113,753346,754645,754847,754927,755035,755226,755348,755370,755418,755426,755790,755905,755938,755960,755986,755998,756352,757448,757624,757849,758156,758180,759398,759932,760502,760783,761046,761482,761632,762216,762879,763107,763502,764967,765016,765809,765951,771607,771661,772844,772876,772884,772920,773889,776638,778962,778966,779893,781720
+/hadoop/core/trunk:727001,727117,727191,727212,727217,727228,727255,727869,728187,729052,729987,732385,732572,732613,732777,732838,732869,733887,734870,734916,736426,738328,738697,740077,740157,741703,741762,743745,743816,743892,744894,745180,746010,746206,746227,746233,746274,746338,746902-746903,746925,746944,746968,746970,747279,747289,747802,748084,748090,748783,749262,749318,749863,750533,752073,752609,752834,752836,752913,752932,753112-753113,753346,754645,754847,754927,755035,755226,755348,755370,755418,755426,755790,755905,755938,755960,755986,755998,756352,757448,757624,757849,758156,758180,759398,759932,760502,760783,761046,761482,761632,762216,762879,763107,763502,764967,765016,765809,765951,771607,771661,772844,772876,772884,772920,773889,776638,778962,778966,779893,781720,784661
Modified: hadoop/core/branches/branch-0.20/CHANGES.txt
URL:
http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.20/CHANGES.txt?rev=784664&r1=784663&r2=784664&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.20/CHANGES.txt (original)
+++ hadoop/core/branches/branch-0.20/CHANGES.txt Mon Jun 15 06:16:59 2009
@@ -129,6 +129,12 @@
causing TestQueueCapacities to fail.
(Sreekanth Ramakrishnan via yhemanth)
+ HADOOP-5921. Fixes a problem in the JobTracker where it sometimes never used
+ to come up due to a system file creation on JobTracker's system-dir failing.
+ This problem would sometimes show up only when the FS for the system-dir
+ (usually HDFS) is started at nearly the same time as the JobTracker.
+ (Amar Kamat via ddas)
+
Release 0.20.0 - 2009-04-15
INCOMPATIBLE CHANGES
Propchange: hadoop/core/branches/branch-0.20/CHANGES.txt
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Mon Jun 15 06:16:59 2009
@@ -1,3 +1,3 @@
/hadoop/core/branches/branch-0.18/CHANGES.txt:727226
/hadoop/core/branches/branch-0.19/CHANGES.txt:713112
-/hadoop/core/trunk/CHANGES.txt:727001,727117,727191,727212,727228,727255,727869,728187,729052,729987,732385,732572,732613,732777,732838,732869,733887,734870,734916,735082,736426,738602,738697,739416,740077,740157,741703,741762,743296,743745,743816,743892,744894,745180,745268,746010,746193,746206,746227,746233,746274,746902-746903,746925,746944,746968,746970,747279,747289,747802,748084,748090,748783,749262,749318,749863,750533,752073,752514,752555,752590,752609,752834,752836,752913,752932,753112-753113,753346,754645,754847,754927,755035,755226,755348,755370,755418,755426,755790,755905,755938,755986,755998,756352,757448,757624,757849,758156,758180,759398,759932,760502,760783,761046,761482,761632,762216,762879,763107,763502,764967,765016,765809,765951,771607,772844,772876,772884,772920,773889,776638,778962,778966,779893,781720
+/hadoop/core/trunk/CHANGES.txt:727001,727117,727191,727212,727228,727255,727869,728187,729052,729987,732385,732572,732613,732777,732838,732869,733887,734870,734916,735082,736426,738602,738697,739416,740077,740157,741703,741762,743296,743745,743816,743892,744894,745180,745268,746010,746193,746206,746227,746233,746274,746902-746903,746925,746944,746968,746970,747279,747289,747802,748084,748090,748783,749262,749318,749863,750533,752073,752514,752555,752590,752609,752834,752836,752913,752932,753112-753113,753346,754645,754847,754927,755035,755226,755348,755370,755418,755426,755790,755905,755938,755986,755998,756352,757448,757624,757849,758156,758180,759398,759932,760502,760783,761046,761482,761632,762216,762879,763107,763502,764967,765016,765809,765951,771607,772844,772876,772884,772920,773889,776638,778962,778966,779893,781720,784661
Modified:
hadoop/core/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/JobTracker.java
URL:
http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/JobTracker.java?rev=784664&r1=784663&r2=784664&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/JobTracker.java (original)
+++ hadoop/core/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/JobTracker.java Mon Jun 15 06:16:59 2009
@@ -111,7 +111,7 @@
private int MAX_BLACKLISTS_PER_TRACKER = 4;
public static enum State { INITIALIZING, RUNNING }
State state = State.INITIALIZING;
- private static final int SYSTEM_DIR_CLEANUP_RETRY_PERIOD = 10000;
+ private static final int FS_ACCESS_RETRY_PERIOD = 10000;
private DNSToSwitchMapping dnsToSwitchMapping;
private NetworkTopology clusterMap = new NetworkTopology();
@@ -1165,17 +1165,38 @@
shouldRecover = false;
// write the jobtracker.info file
- FSDataOutputStream out = FileSystem.create(fs, restartFile, filePerm);
- out.writeInt(0);
- out.close();
+ try {
+ FSDataOutputStream out = FileSystem.create(fs, restartFile,
+ filePerm);
+ out.writeInt(0);
+ out.close();
+ } catch (IOException ioe) {
+ LOG.warn("Writing to file " + restartFile + " failed!");
+ LOG.warn("FileSystem is not ready yet!");
+ fs.delete(restartFile, false);
+ throw ioe;
+ }
return;
}
FSDataInputStream in = fs.open(restartFile);
- // read the old count
- restartCount = in.readInt();
- ++restartCount; // increment the restart count
- in.close();
+ try {
+ // read the old count
+ restartCount = in.readInt();
+ ++restartCount; // increment the restart count
+ } catch (IOException ioe) {
+ LOG.warn("System directory is garbled. Failed to read file "
+ + restartFile);
+ LOG.warn("Jobtracker recovery is not possible with garbled"
+ + " system directory! Please delete the system directory and"
+ + " restart the jobtracker. Note that deleting the system"
+ + " directory will result in loss of all the running jobs.");
+ throw new RuntimeException(ioe);
+ } finally {
+ if (in != null) {
+ in.close();
+ }
+ }
// Write back the new restart count and rename the old info file
//TODO This is similar to jobhistory recovery, maybe this common code
@@ -1664,24 +1685,7 @@
}
LOG.info("problem cleaning system directory: " + systemDir, ie);
}
- Thread.sleep(SYSTEM_DIR_CLEANUP_RETRY_PERIOD);
- }
-
- // Prepare for recovery. This is done irrespective of the status of restart
- // flag.
- try {
- recoveryManager.updateRestartCount();
- } catch (IOException ioe) {
- LOG.warn("Failed to initialize recovery manager. The Recovery manager "
- + "failed to access the system files in the system dir ("
- + getSystemDir() + ").");
- LOG.warn("It might be because the JobTracker failed to read/write system"
- + " files (" + recoveryManager.getRestartCountFile() + " / "
- + recoveryManager.getTempRestartCountFile() + ") or the system "
- + " file " + recoveryManager.getRestartCountFile()
- + " is missing!");
- LOG.warn("Bailing out...");
- throw ioe;
+ Thread.sleep(FS_ACCESS_RETRY_PERIOD);
}
// Same with 'localDir' except it's always on the local disk.
@@ -1776,6 +1780,20 @@
* Run forever
*/
public void offerService() throws InterruptedException, IOException {
+ // Prepare for recovery. This is done irrespective of the status of restart
+ // flag.
+ while (true) {
+ try {
+ recoveryManager.updateRestartCount();
+ break;
+ } catch (IOException ioe) {
+ LOG.warn("Failed to initialize recovery manager. ", ioe);
+ // wait for some time
+ Thread.sleep(FS_ACCESS_RETRY_PERIOD);
+ LOG.warn("Retrying...");
+ }
+ }
+
taskScheduler.start();
// Start the recovery after starting the scheduler
Modified:
hadoop/core/branches/branch-0.20/src/test/org/apache/hadoop/mapred/TestRecoveryManager.java
URL:
http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.20/src/test/org/apache/hadoop/mapred/TestRecoveryManager.java?rev=784664&r1=784663&r2=784664&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.20/src/test/org/apache/hadoop/mapred/TestRecoveryManager.java (original)
+++ hadoop/core/branches/branch-0.20/src/test/org/apache/hadoop/mapred/TestRecoveryManager.java Mon Jun 15 06:16:59 2009
@@ -28,6 +28,7 @@
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.mapred.JobTracker.RecoveryManager;
import org.apache.hadoop.mapred.MiniMRCluster.JobTrackerRunner;
import org.apache.hadoop.mapred.TestJobInProgressListener.MyScheduler;
@@ -310,7 +311,7 @@
fs.delete(rFile,false);
// start the jobtracker
- LOG.info("Stopping jobtracker with system files deleted");
+ LOG.info("Starting jobtracker with system files deleted");
mr.startJobTracker();
UtilsForTests.waitForJobTracker(jc);
@@ -394,8 +395,58 @@
LOG.info("Starting jobtracker with fs errors");
mr.startJobTracker();
JobTrackerRunner runner = mr.getJobTrackerRunner();
- assertFalse("Restart count for new job is incorrect", runner.isActive());
+ assertFalse("JobTracker is still alive", runner.isActive());
mr.shutdown();
}
+
+  /**
+   * Test if the jobtracker waits for the info file to be created before
+   * starting.
+   *
+   * The test shuts down all datanodes, then checks that
+   * {@code updateRestartCount()} fails with an {@link IOException} and that
+   * neither the restart-count file nor its temporary counterpart exists
+   * afterwards. It then starts one datanode and checks that the same call
+   * succeeds.
+   */
+  public void testJobTrackerInfoCreation() throws Exception {
+    LOG.info("Testing jobtracker.info file");
+    MiniDFSCluster dfs = new MiniDFSCluster(new Configuration(), 1, true, null);
+    String namenode = (dfs.getFileSystem()).getUri().getHost() + ":"
+                      + (dfs.getFileSystem()).getUri().getPort();
+    // shut down the data nodes
+    dfs.shutdownDataNodes();
+
+    // start the jobtracker
+    JobConf conf = new JobConf();
+    FileSystem.setDefaultUri(conf, namenode);
+    conf.set("mapred.job.tracker", "localhost:0");
+    conf.set("mapred.job.tracker.http.address", "127.0.0.1:0");
+
+    JobTracker jobtracker = new JobTracker(conf);
+
+    // now check if the update restart count works fine or not
+    boolean failed = false;
+    try {
+      jobtracker.recoveryManager.updateRestartCount();
+    } catch (IOException ioe) {
+      failed = true;
+    }
+    assertTrue("JobTracker created info files without datanodes!!!", failed);
+
+    Path restartFile = jobtracker.recoveryManager.getRestartCountFile();
+    Path tmpRestartFile = jobtracker.recoveryManager.getTempRestartCountFile();
+    FileSystem fs = dfs.getFileSystem();
+    assertFalse("Info file exists after update failure",
+                fs.exists(restartFile));
+    // check the temporary file itself, not the main info file a second time
+    assertFalse("Temporary restart-file exists after update failure",
+                fs.exists(tmpRestartFile));
+
+    // start 1 data node
+    dfs.startDataNodes(conf, 1, true, null, null, null, null);
+    dfs.waitActive();
+
+    failed = false;
+    try {
+      jobtracker.recoveryManager.updateRestartCount();
+    } catch (IOException ioe) {
+      failed = true;
+    }
+    assertFalse("JobTracker failed to create info files with datanodes!!!",
+                failed);
+  }
}