Author: jdcryans
Date: Mon Feb 13 23:13:39 2012
New Revision: 1243735
URL: http://svn.apache.org/viewvc?rev=1243735&view=rev
Log:
HBASE-5197 [replication] Handle socket timeouts in ReplicationSource
to prevent DDOS
Modified:
hbase/branches/0.90/CHANGES.txt
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
Modified: hbase/branches/0.90/CHANGES.txt
URL:
http://svn.apache.org/viewvc/hbase/branches/0.90/CHANGES.txt?rev=1243735&r1=1243734&r2=1243735&view=diff
==============================================================================
--- hbase/branches/0.90/CHANGES.txt (original)
+++ hbase/branches/0.90/CHANGES.txt Mon Feb 13 23:13:39 2012
@@ -50,6 +50,8 @@ Release 0.90.6 - Unreleased
HBASE-5102 Change the default value of the property
"hbase.connection.per.config" to false in
hbase-default.xml
HBASE-5327 Print a message when an invalid hbase.rootdir is passed (Jimmy
Xiang)
+ HBASE-5197 [replication] Handle socket timeouts in ReplicationSource
+ to prevent DDOS
Release 0.90.5 - Dec 22, 2011
BUG FIXES
Modified:
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
URL:
http://svn.apache.org/viewvc/hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java?rev=1243735&r1=1243734&r2=1243735&view=diff
==============================================================================
--- hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java (original)
+++ hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java Mon Feb 13 23:13:39 2012
@@ -22,6 +22,7 @@ package org.apache.hadoop.hbase.replicat
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
+import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
@@ -118,7 +119,9 @@ public class ReplicationSource extends T
// List of all the dead region servers that had this queue (if recovered)
private String[] deadRegionServers;
// Maximum number of retries before taking bold actions
- private long maxRetriesMultiplier;
+ private int maxRetriesMultiplier;
+ // Socket timeouts require even bolder actions since we don't want to DDOS
+ private int socketTimeoutMultiplier;
// Current number of entries that we need to replicate
private int currentNbEntries = 0;
// Current number of operations (Put/Delete) that we need to replicate
@@ -160,7 +163,8 @@ public class ReplicationSource extends T
this.entriesArray[i] = new HLog.Entry();
}
this.maxRetriesMultiplier =
- this.conf.getLong("replication.source.maxretriesmultiplier", 10);
+ this.conf.getInt("replication.source.maxretriesmultiplier", 10);
+ this.socketTimeoutMultiplier = maxRetriesMultiplier * maxRetriesMultiplier;
this.queue =
new PriorityBlockingQueue<Path>(
conf.getInt("hbase.regionserver.maxlogs", 32),
@@ -583,8 +587,19 @@ public class ReplicationSource extends T
ioe = ((RemoteException) ioe).unwrapRemoteException();
LOG.warn("Can't replicate because of an error on the remote cluster: ", ioe);
} else {
- LOG.warn("Can't replicate because of a local or network error: ", ioe);
+ if (ioe instanceof SocketTimeoutException) {
+ // This exception means we waited for more than 60s and nothing
+ // happened, the cluster is alive and calling it right away
+ // even for a test just makes things worse.
+ sleepForRetries("Encountered a SocketTimeoutException. Since the" +
+ "call to the remote cluster timed out, which is usually " +
+ "caused by a machine failure or a massive slowdown",
+ this.socketTimeoutMultiplier);
+ } else {
+ LOG.warn("Can't replicate because of a local or network error: ", ioe);
+ }
}
+
try {
boolean down;
// Spin while the slave is down and we're not asked to shutdown/close