Author: jdcryans
Date: Mon Feb 13 23:12:55 2012
New Revision: 1243733
URL: http://svn.apache.org/viewvc?rev=1243733&view=rev
Log:
HBASE-5197 [replication] Handle socket timeouts in ReplicationSource
to prevent DDOS
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java?rev=1243733&r1=1243732&r2=1243733&view=diff
==============================================================================
---
hbase/trunk/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
(original)
+++
hbase/trunk/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
Mon Feb 13 23:12:55 2012
@@ -22,6 +22,7 @@ package org.apache.hadoop.hbase.replicat
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
+import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
@@ -124,7 +125,9 @@ public class ReplicationSource extends T
// List of all the dead region servers that had this queue (if recovered)
private String[] deadRegionServers;
// Maximum number of retries before taking bold actions
- private long maxRetriesMultiplier;
+ private int maxRetriesMultiplier;
+ // Socket timeouts require even bolder actions since we don't want to DDOS
+ private int socketTimeoutMultiplier;
// Current number of entries that we need to replicate
private int currentNbEntries = 0;
// Current number of operations (Put/Delete) that we need to replicate
@@ -166,7 +169,8 @@ public class ReplicationSource extends T
this.entriesArray[i] = new HLog.Entry();
}
this.maxRetriesMultiplier =
- this.conf.getLong("replication.source.maxretriesmultiplier", 10);
+ this.conf.getInt("replication.source.maxretriesmultiplier", 10);
+ this.socketTimeoutMultiplier = maxRetriesMultiplier * maxRetriesMultiplier;
this.queue =
new PriorityBlockingQueue<Path>(
conf.getInt("hbase.regionserver.maxlogs", 32),
@@ -620,8 +624,19 @@ public class ReplicationSource extends T
ioe = ((RemoteException) ioe).unwrapRemoteException();
LOG.warn("Can't replicate because of an error on the remote cluster: ", ioe);
} else {
-        LOG.warn("Can't replicate because of a local or network error: ", ioe);
+ if (ioe instanceof SocketTimeoutException) {
+ // This exception means we waited for more than 60s and nothing
+ // happened, the cluster is alive and calling it right away
+ // even for a test just makes things worse.
+ sleepForRetries("Encountered a SocketTimeoutException. Since the " +
+ "call to the remote cluster timed out, which is usually " +
+ "caused by a machine failure or a massive slowdown",
+ this.socketTimeoutMultiplier);
+ } else {
+          LOG.warn("Can't replicate because of a local or network error: ", ioe);
+ }
}
+
try {
boolean down;
// Spin while the slave is down and we're not asked to shutdown/close