[
https://issues.apache.org/jira/browse/HDFS-15798?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
huhaiyang updated HDFS-15798:
-----------------------------
Description:
When an EC reconstruction task fails, the decrementXmitsInProgress operation is
performed twice.
As a result, the XmitsInProgress count of the DN can become negative.
{code:java}
// 1.ErasureCodingWorker.java
public void processErasureCodingTasks(
Collection<BlockECReconstructionInfo> ecTasks) {
for (BlockECReconstructionInfo reconInfo : ecTasks) {
int xmitsSubmitted = 0;
try {
...
// It may throw IllegalArgumentException from task#stripedReader
// constructor.
final StripedBlockReconstructor task =
new StripedBlockReconstructor(this, stripedReconInfo);
if (task.hasValidTargets()) {
// See HDFS-12044. We increase xmitsInProgress even the task is only
// enqueued, so that
// 1) NN will not send more tasks than what DN can execute and
// 2) DN will not throw away reconstruction tasks, and instead keeps
// an unbounded number of tasks in the executor's task queue.
xmitsSubmitted = Math.max((int)(task.getXmits() * xmitWeight), 1);
getDatanode().incrementXmitsInProcess(xmitsSubmitted); // 1. task start increment
stripedReconstructionPool.submit(task);
} else {
LOG.warn("No missing internal block. Skip reconstruction for task:{}",
reconInfo);
}
} catch (Throwable e) {
getDatanode().decrementXmitsInProgress(xmitsSubmitted); // 2.2. task failed decrement
LOG.warn("Failed to reconstruct striped block {}",
reconInfo.getExtendedBlock().getLocalBlock(), e);
}
}
}
// 2.StripedBlockReconstructor.java
public void run() {
try {
initDecoderIfNecessary();
...
} catch (Throwable e) {
LOG.warn("Failed to reconstruct striped block: {}", getBlockGroup(), e);
getDatanode().getMetrics().incrECFailedReconstructionTasks();
} finally {
float xmitWeight = getErasureCodingWorker().getXmitWeight();
// if the xmits is smaller than 1, the xmitsSubmitted should be set to 1
// because if it set to zero, we cannot to measure the xmits submitted
int xmitsSubmitted = Math.max((int) (getXmits() * xmitWeight), 1);
getDatanode().decrementXmitsInProgress(xmitsSubmitted); // 2.1. task failed decrement
...
}
}{code}
was:
When an EC reconstruction task fails, the decrementXmitsInProgress operation is
performed twice.
As a result, the XmitsInProgress count of the DN can become negative.
{code:java}
// 1.ErasureCodingWorker.java
public void processErasureCodingTasks(
Collection<BlockECReconstructionInfo> ecTasks) {
for (BlockECReconstructionInfo reconInfo : ecTasks) {
int xmitsSubmitted = 0;
try {
...
// It may throw IllegalArgumentException from task#stripedReader
// constructor.
final StripedBlockReconstructor task =
new StripedBlockReconstructor(this, stripedReconInfo);
if (task.hasValidTargets()) {
// See HDFS-12044. We increase xmitsInProgress even the task is only
// enqueued, so that
// 1) NN will not send more tasks than what DN can execute and
// 2) DN will not throw away reconstruction tasks, and instead keeps
// an unbounded number of tasks in the executor's task queue.
xmitsSubmitted = Math.max((int)(task.getXmits() * xmitWeight), 1);
getDatanode().incrementXmitsInProcess(xmitsSubmitted); // task start increment
stripedReconstructionPool.submit(task);
} else {
LOG.warn("No missing internal block. Skip reconstruction for task:{}",
reconInfo);
}
} catch (Throwable e) {
getDatanode().decrementXmitsInProgress(xmitsSubmitted); // 2. task failed decrement
LOG.warn("Failed to reconstruct striped block {}",
reconInfo.getExtendedBlock().getLocalBlock(), e);
}
}
}
// 2.StripedBlockReconstructor.java
public void run() {
try {
initDecoderIfNecessary();
...
} catch (Throwable e) {
LOG.warn("Failed to reconstruct striped block: {}", getBlockGroup(), e);
getDatanode().getMetrics().incrECFailedReconstructionTasks();
} finally {
float xmitWeight = getErasureCodingWorker().getXmitWeight();
// if the xmits is smaller than 1, the xmitsSubmitted should be set to 1
// because if it set to zero, we cannot to measure the xmits submitted
int xmitsSubmitted = Math.max((int) (getXmits() * xmitWeight), 1);
getDatanode().decrementXmitsInProgress(xmitsSubmitted); // 1. task failed decrement
...
}
}{code}
> EC: Reconstruct task failed, and the decrementXmitsInProgress operation will
> be performed twice
> -----------------------------------------------------------------------------------------------
>
> Key: HDFS-15798
> URL: https://issues.apache.org/jira/browse/HDFS-15798
> Project: Hadoop HDFS
> Issue Type: Bug
> Reporter: huhaiyang
> Priority: Major
>
> When an EC reconstruction task fails, the decrementXmitsInProgress operation
> is performed twice.
> As a result, the XmitsInProgress count of the DN can become negative.
>
> {code:java}
> // 1.ErasureCodingWorker.java
> public void processErasureCodingTasks(
> Collection<BlockECReconstructionInfo> ecTasks) {
> for (BlockECReconstructionInfo reconInfo : ecTasks) {
> int xmitsSubmitted = 0;
> try {
> ...
> // It may throw IllegalArgumentException from task#stripedReader
> // constructor.
> final StripedBlockReconstructor task =
> new StripedBlockReconstructor(this, stripedReconInfo);
> if (task.hasValidTargets()) {
> // See HDFS-12044. We increase xmitsInProgress even the task is only
> // enqueued, so that
> // 1) NN will not send more tasks than what DN can execute and
> // 2) DN will not throw away reconstruction tasks, and instead keeps
> // an unbounded number of tasks in the executor's task queue.
> xmitsSubmitted = Math.max((int)(task.getXmits() * xmitWeight), 1);
> getDatanode().incrementXmitsInProcess(xmitsSubmitted); // 1. task start increment
> stripedReconstructionPool.submit(task);
> } else {
> LOG.warn("No missing internal block. Skip reconstruction for task:{}",
> reconInfo);
> }
> } catch (Throwable e) {
> getDatanode().decrementXmitsInProgress(xmitsSubmitted); // 2.2. task failed decrement
> LOG.warn("Failed to reconstruct striped block {}",
> reconInfo.getExtendedBlock().getLocalBlock(), e);
> }
> }
> }
> // 2.StripedBlockReconstructor.java
> public void run() {
> try {
> initDecoderIfNecessary();
> ...
> } catch (Throwable e) {
> LOG.warn("Failed to reconstruct striped block: {}", getBlockGroup(), e);
> getDatanode().getMetrics().incrECFailedReconstructionTasks();
> } finally {
> float xmitWeight = getErasureCodingWorker().getXmitWeight();
> // if the xmits is smaller than 1, the xmitsSubmitted should be set to 1
> // because if it set to zero, we cannot to measure the xmits submitted
> int xmitsSubmitted = Math.max((int) (getXmits() * xmitWeight), 1);
> getDatanode().decrementXmitsInProgress(xmitsSubmitted); // 2.1. task failed decrement
> ...
> }
> }{code}
--
This message was sent by Atlassian Jira
(v8.3.4#803005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]