[
https://issues.apache.org/jira/browse/MAPREDUCE-6076?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
zhihai xu updated MAPREDUCE-6076:
---------------------------------
Description:
A zero map split input length combined with non-zero map split input lengths
may sometimes cause an MR1 job to hang.
This problem may happen when using HBase input splits (TableSplit).
The HBase split input length can be zero for unknown regions or non-zero for
known regions, as shown in the following code:
{code}
// TableSplit.java
public long getLength() {
return length;
}
// RegionSizeCalculator.java
public long getRegionSize(byte[] regionId) {
Long size = sizeMap.get(regionId);
if (size == null) {
LOG.debug("Unknown region:" + Arrays.toString(regionId));
return 0;
} else {
return size;
}
}
{code}
The TableSplit length comes from RegionSizeCalculator.getRegionSize.
The job hangs because, in MR1, if these zero-input-length map tasks are
scheduled and completed before all of the non-zero-input-length map tasks are
scheduled, scheduling a new map task in JobProgress.java will fail the
TaskTracker resource check at:
{code}
// findNewMapTask
// Check to ensure this TaskTracker has enough resources to
// run tasks from this job
long outSize = resourceEstimator.getEstimatedMapOutputSize();
long availSpace = tts.getResourceStatus().getAvailableSpace();
if(availSpace < outSize) {
LOG.warn("No room for map task. Node " + tts.getHost() +
" has " + availSpace +
" bytes free; but we expect map to take " + outSize);
return -1; //see if a different TIP might work better.
}
{code}
The resource estimate is calculated in:
{code}
// in ResourceEstimator.java
protected synchronized long getEstimatedTotalMapOutputSize() {
if(completedMapsUpdates < threshholdToUse) {
return 0;
} else {
long inputSize = job.getInputLength() + job.desiredMaps();
//add desiredMaps() so that randomwriter case doesn't blow up
//the multiplication might lead to overflow, casting it with
//double prevents it
long estimate = Math.round(((double)inputSize *
completedMapsOutputSize * 2.0)/completedMapsInputSize);
if (LOG.isDebugEnabled()) {
LOG.debug("estimate total map output will be " + estimate);
}
return estimate;
}
}
protected synchronized void updateWithCompletedTask(TaskStatus ts,
TaskInProgress tip) {
//-1 indicates error, which we don't average in.
if(tip.isMapTask() && ts.getOutputSize() != -1) {
completedMapsUpdates++;
completedMapsInputSize+=(tip.getMapInputSize()+1);
completedMapsOutputSize+=ts.getOutputSize();
if(LOG.isDebugEnabled()) {
LOG.debug("completedMapsUpdates:"+completedMapsUpdates+" "+
"completedMapsInputSize:"+completedMapsInputSize+" " +
"completedMapsOutputSize:"+completedMapsOutputSize);
}
}
}
{code}
You can see from the calculation that completedMapsInputSize will be a very
small number while inputSize * completedMapsOutputSize will be a very big
number.
For example, with completedMapsInputSize = 1, inputSize = 100 MBytes and
completedMapsOutputSize = 100 MBytes, the estimate will be 5000TB, which is
more than the disk space of most task trackers.
So I think that if the map split input length is 0, it means the split input
length is unknown, and it is reasonable to use the map output size as the
input size for the calculation in ResourceEstimator. I will upload a fix based
on this approach.
was:
A zero map split input length combined with non-zero map split input lengths
will cause an MR1 job to hang.
This problem may happen when using HBase input splits (TableSplit).
The HBase split input length can be zero for unknown regions or non-zero for
known regions, as shown in the following code:
{code}
// TableSplit.java
public long getLength() {
return length;
}
// RegionSizeCalculator.java
public long getRegionSize(byte[] regionId) {
Long size = sizeMap.get(regionId);
if (size == null) {
LOG.debug("Unknown region:" + Arrays.toString(regionId));
return 0;
} else {
return size;
}
}
{code}
The TableSplit length comes from RegionSizeCalculator.getRegionSize.
The job hangs because, in MR1, if these zero-input-length map tasks are
scheduled and completed before all of the non-zero-input-length map tasks are
scheduled, scheduling a new map task in JobProgress.java will fail the
TaskTracker resource check at:
{code}
// findNewMapTask
// Check to ensure this TaskTracker has enough resources to
// run tasks from this job
long outSize = resourceEstimator.getEstimatedMapOutputSize();
long availSpace = tts.getResourceStatus().getAvailableSpace();
if(availSpace < outSize) {
LOG.warn("No room for map task. Node " + tts.getHost() +
" has " + availSpace +
" bytes free; but we expect map to take " + outSize);
return -1; //see if a different TIP might work better.
}
{code}
The resource estimate is calculated in:
{code}
// in ResourceEstimator.java
protected synchronized long getEstimatedTotalMapOutputSize() {
if(completedMapsUpdates < threshholdToUse) {
return 0;
} else {
long inputSize = job.getInputLength() + job.desiredMaps();
//add desiredMaps() so that randomwriter case doesn't blow up
//the multiplication might lead to overflow, casting it with
//double prevents it
long estimate = Math.round(((double)inputSize *
completedMapsOutputSize * 2.0)/completedMapsInputSize);
if (LOG.isDebugEnabled()) {
LOG.debug("estimate total map output will be " + estimate);
}
return estimate;
}
}
protected synchronized void updateWithCompletedTask(TaskStatus ts,
TaskInProgress tip) {
//-1 indicates error, which we don't average in.
if(tip.isMapTask() && ts.getOutputSize() != -1) {
completedMapsUpdates++;
completedMapsInputSize+=(tip.getMapInputSize()+1);
completedMapsOutputSize+=ts.getOutputSize();
if(LOG.isDebugEnabled()) {
LOG.debug("completedMapsUpdates:"+completedMapsUpdates+" "+
"completedMapsInputSize:"+completedMapsInputSize+" " +
"completedMapsOutputSize:"+completedMapsOutputSize);
}
}
}
{code}
You can see from the calculation that completedMapsInputSize will be a very
small number while inputSize * completedMapsOutputSize will be a very big
number.
For example, with completedMapsInputSize = 1, inputSize = 100 MBytes and
completedMapsOutputSize = 100 MBytes, the estimate will be 5000TB, which is
more than the disk space of most task trackers.
So I think that if the map split input length is 0, it means the split input
length is unknown, and it is reasonable to use the map output size as the
input size for the calculation in ResourceEstimator. I will upload a fix based
on this approach.
> Zero map split input length combine with none zero map split input length
> may cause MR1 job hung sometimes.
> -------------------------------------------------------------------------------------------------------------
>
> Key: MAPREDUCE-6076
> URL: https://issues.apache.org/jira/browse/MAPREDUCE-6076
> Project: Hadoop Map/Reduce
> Issue Type: Bug
> Components: mrv1
> Reporter: zhihai xu
> Assignee: zhihai xu
> Fix For: 1.3.0
>
> Attachments: MAPREDUCE-6076.branch-1.000.patch
>
>
> A zero map split input length combined with non-zero map split input lengths
> may sometimes cause an MR1 job to hang.
> This problem may happen when using HBase input splits (TableSplit).
> The HBase split input length can be zero for unknown regions or non-zero for
> known regions, as shown in the following code:
> {code}
> // TableSplit.java
> public long getLength() {
> return length;
> }
> // RegionSizeCalculator.java
> public long getRegionSize(byte[] regionId) {
> Long size = sizeMap.get(regionId);
> if (size == null) {
> LOG.debug("Unknown region:" + Arrays.toString(regionId));
> return 0;
> } else {
> return size;
> }
> }
> {code}
> The TableSplit length comes from RegionSizeCalculator.getRegionSize.
> The job hangs because, in MR1, if these zero-input-length map tasks are
> scheduled and completed before all of the non-zero-input-length map tasks are
> scheduled, scheduling a new map task in JobProgress.java will fail the
> TaskTracker resource check at:
> {code}
> // findNewMapTask
> // Check to ensure this TaskTracker has enough resources to
> // run tasks from this job
> long outSize = resourceEstimator.getEstimatedMapOutputSize();
> long availSpace = tts.getResourceStatus().getAvailableSpace();
> if(availSpace < outSize) {
> LOG.warn("No room for map task. Node " + tts.getHost() +
> " has " + availSpace +
> " bytes free; but we expect map to take " + outSize);
> return -1; //see if a different TIP might work better.
> }
> {code}
> The resource estimate is calculated in:
> {code}
> // in ResourceEstimator.java
> protected synchronized long getEstimatedTotalMapOutputSize() {
> if(completedMapsUpdates < threshholdToUse) {
> return 0;
> } else {
> long inputSize = job.getInputLength() + job.desiredMaps();
> //add desiredMaps() so that randomwriter case doesn't blow up
> //the multiplication might lead to overflow, casting it with
> //double prevents it
> long estimate = Math.round(((double)inputSize *
> completedMapsOutputSize * 2.0)/completedMapsInputSize);
> if (LOG.isDebugEnabled()) {
> LOG.debug("estimate total map output will be " + estimate);
> }
> return estimate;
> }
> }
> protected synchronized void updateWithCompletedTask(TaskStatus ts,
> TaskInProgress tip) {
> //-1 indicates error, which we don't average in.
> if(tip.isMapTask() && ts.getOutputSize() != -1) {
> completedMapsUpdates++;
> completedMapsInputSize+=(tip.getMapInputSize()+1);
> completedMapsOutputSize+=ts.getOutputSize();
> if(LOG.isDebugEnabled()) {
> LOG.debug("completedMapsUpdates:"+completedMapsUpdates+" "+
> "completedMapsInputSize:"+completedMapsInputSize+" " +
> "completedMapsOutputSize:"+completedMapsOutputSize);
> }
> }
> }
> {code}
> You can see from the calculation that completedMapsInputSize will be a very
> small number while inputSize * completedMapsOutputSize will be a very big
> number.
> For example, with completedMapsInputSize = 1, inputSize = 100 MBytes and
> completedMapsOutputSize = 100 MBytes, the estimate will be 5000TB, which is
> more than the disk space of most task trackers.
> So I think that if the map split input length is 0, it means the split input
> length is unknown, and it is reasonable to use the map output size as the
> input size for the calculation in ResourceEstimator. I will upload a fix based
> on this approach.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)