On Mon, Nov 16, 2009 at 07:47:34PM +0100, Lars Ellenberg wrote:
> On Mon, Nov 16, 2009 at 07:10:12PM +0100, Dejan Muhamedagic wrote:
> > Hi,
> >
> > On Mon, Nov 02, 2009 at 12:07:59PM -0500, Sam Tran wrote:
> > > Hi All,
> > >
> > > I followed the thread "Food for thought: add something like cutter to
> > > IPaddr2 (or portblock?) RA"
> > > (http://lists.linux-ha.org/pipermail/linux-ha-dev/2008-October/016196.html)
> > > with great interest.
> > >
> > > I am working on a cluster of Master OpenLDAP servers using PaceMaker
> > > and OpenAIS. The problem I have lies in the replication between the
> > > master server that holds the IP address resource and a replica server.
> > > In the "refreshAndPersist" replication mode that is being used, the
> > > replica polls the master server for updates, then the connection
> > > between the replica and the master server is maintained, and the
> > > replica is waiting for subsequent updates from the master server. In
> > > the event of a failure of the initial master the new master is taking
> > > over the IP address resource, but doesn't know anything about the
> > > previous persist stage, therefore is not able to send new updates to
> > > the replica. An RST needs to be sent to the replica in order to
> > > terminate the existing session and force a polling retry from the
> > > replica, or the replica would wait for the session to time out.
> > >
> > > I was wondering whether some work has been done as far as the
> > > implentation of the tickle ACK feature in IPaddr2 RA is concerned.
> >
> > Not to my knowledge. It would obviously be a good feature. The
> > only thing which is not clear to me is who/how would
> > keep/maintain/synchronize the connections database
>
> As a "best effort" sort of thing, you could do a "depth=X" monitoring
> action in the IPaddr2 RA, which would
> grep "ESTABLISHED" /proc/net/nf_conntrack |
> dd conv=fsync of=/somewhere/on/DRBD/or/NFS/or/iSCSI
>
> On stop, it may (optionally?) truncate that state file.
>
> On start, it would (optionally?) check that state file,
> and send out "Tickle ACKs".
>
> You will miss only those connections that have been
> established since the last "grep", i.e. since the last
> "monitor depth=X". If you want more, use conntrackd.
>
> Volunteers?
>
This is a simple implementation of the tickle ACK feature in the IPaddr2
RA. The code is essentially borrowed from ctdb.samba.org, but I haven't
tested it in a Heartbeat/OpenAIS cluster environment yet, so it may not
work for now :)
Thanks,
Jiaju
---
Index: resource-agents/tools/tickle_tcp.c
===================================================================
--- /dev/null
+++ resource-agents/tools/tickle_tcp.c
@@ -0,0 +1,316 @@
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/tcp.h>
+
+#define discard_const(ptr) ((void *)((intptr_t)(ptr))) /* drop qualifiers by round-tripping the pointer through intptr_t */
+
+typedef union { /* one socket address, viewable as generic, IPv4 or IPv6 */
+ struct sockaddr sa; /* family-independent view */
+ struct sockaddr_in ip; /* IPv4 view; its sin_family also serves as the family tag in send_tickle_ack() */
+ struct sockaddr_in6 ip6; /* IPv6 view */
+} sock_addr;
+
+/*
+ * Add up the n bytes starting at data as a sequence of big-endian
+ * (network byte order) 16-bit words and return the raw 32-bit total.
+ * No carry folding and no complement is applied; callers do that.
+ *
+ * An odd trailing byte is treated as the high-order byte of a final word
+ * whose low-order byte is zero.  The buffer is read byte by byte so the
+ * result does not depend on host endianness or on the alignment of data.
+ */
+uint32_t uint16_checksum(uint16_t *data, size_t n)
+{
+	const uint8_t *bytes = (const uint8_t *)data;
+	uint32_t sum = 0;
+
+	while (n >= 2) {
+		sum += ((uint32_t)bytes[0] << 8) | bytes[1];
+		bytes += 2;
+		n -= 2;
+	}
+	if (n == 1) {
+		/* Pad the last byte on the right, independent of host order. */
+		sum += (uint32_t)bytes[0] << 8;
+	}
+	return sum;
+}
+
+/*
+ * Checksum for a TCP segment carried over IPv4.
+ *
+ * data/n describe the TCP header (and payload, if any); ip supplies the
+ * pseudo-header inputs: source address, destination address and protocol.
+ * The segment length n is added as the pseudo-header length field.
+ * Returns the complemented, folded sum converted with htons(); a result
+ * of 0 is reported as 0xFFFF.
+ */
+static uint16_t tcp_checksum(uint16_t *data, size_t n, struct iphdr *ip)
+{
+	uint32_t acc;
+	uint16_t folded;
+	int pass;
+
+	/* Pseudo-header words first, then the segment itself. */
+	acc = uint16_checksum((uint16_t *)(void *)&ip->saddr,
+			      sizeof(ip->saddr));
+	acc += uint16_checksum((uint16_t *)(void *)&ip->daddr,
+			       sizeof(ip->daddr));
+	acc += ip->protocol + n;
+	acc += uint16_checksum(data, n);
+
+	/* Fold the carries back into the low 16 bits, twice. */
+	for (pass = 0; pass < 2; pass++) {
+		acc = (acc & 0xFFFF) + (acc >> 16);
+	}
+
+	folded = (uint16_t)~htons(acc);
+	return folded != 0 ? folded : 0xFFFF;
+}
+
+/*
+ * Checksum for a TCP segment carried over IPv6.
+ *
+ * The pseudo-header consists of the 16-byte source and destination
+ * addresses from ip6, followed by the segment length n and the
+ * next-header value, each as a 32-bit big-endian word.  data/n describe
+ * the TCP header (and payload, if any).  Returns the complemented, folded
+ * sum converted with htons(); a result of 0 is reported as 0xFFFF.
+ */
+static uint16_t tcp_checksum6(uint16_t *data, size_t n, struct ip6_hdr *ip6)
+{
+	uint32_t pseudo[2];
+	uint32_t acc;
+	uint16_t folded;
+	int pass;
+
+	pseudo[0] = htonl(n);
+	pseudo[1] = htonl(ip6->ip6_nxt);
+
+	acc = uint16_checksum((uint16_t *)(void *)&ip6->ip6_src, 16);
+	acc += uint16_checksum((uint16_t *)(void *)&ip6->ip6_dst, 16);
+	acc += uint16_checksum((uint16_t *)pseudo, 8);
+	acc += uint16_checksum(data, n);
+
+	/* Fold the carries back into the low 16 bits, twice. */
+	for (pass = 0; pass < 2; pass++) {
+		acc = (acc & 0xFFFF) + (acc >> 16);
+	}
+
+	folded = (uint16_t)~htons(acc);
+	return folded != 0 ? folded : 0xFFFF;
+}
+
+/*
+ * Switch fd to non-blocking mode by adding O_NONBLOCK to its file status
+ * flags.  Best effort: nothing is reported to the caller.  If the current
+ * flags cannot be read, the descriptor is left untouched rather than
+ * having the -1 error value OR-ed into the new flag word.
+ */
+void set_nonblocking(int fd)
+{
+	int flags = fcntl(fd, F_GETFL, 0);
+
+	if (flags == -1) {
+		return;
+	}
+	(void)fcntl(fd, F_SETFL, flags | O_NONBLOCK);
+}
+
+/*
+ * Mark fd close-on-exec by adding FD_CLOEXEC to its descriptor flags.
+ * Best effort: nothing is reported to the caller.  If the current flags
+ * cannot be read, the descriptor is left untouched rather than having the
+ * -1 error value OR-ed into the new flag word.
+ */
+void set_close_on_exec(int fd)
+{
+	int flags = fcntl(fd, F_GETFD, 0);
+
+	if (flags == -1) {
+		return;
+	}
+	(void)fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
+}
+
+/*
+ * Fill *sin with the IPv4 address given in dotted-quad text form in s and
+ * with port (host byte order, stored via htons()).
+ *
+ * The structure is cleared first so that no field, including the padding,
+ * is left holding whatever the caller's storage happened to contain.
+ * Returns 0 on success, -1 (after a message on stderr) if s is not a
+ * valid IPv4 address.
+ */
+static int parse_ipv4(const char *s, unsigned port, struct sockaddr_in *sin)
+{
+	memset(sin, 0, sizeof(*sin));
+	sin->sin_family = AF_INET;
+	sin->sin_port = htons(port);
+
+	if (inet_pton(AF_INET, s, &sin->sin_addr) != 1) {
+		fprintf(stderr, "Failed to translate %s into sin_addr\n", s);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Fill the IPv6 view of *saddr from the textual address s and from port
+ * (host byte order, stored via htons()).  Flow info and scope id start
+ * out as 0; when iface is given and the address is link-local, the scope
+ * id is set to whatever if_nametoindex(iface) returns.
+ *
+ * Returns 0 on success, -1 (after a message on stderr) if s is not a
+ * valid IPv6 address.
+ */
+static int parse_ipv6(const char *s, const char *iface, unsigned port,
+		      sock_addr *saddr)
+{
+	struct sockaddr_in6 *sin6 = &saddr->ip6;
+
+	sin6->sin6_family = AF_INET6;
+	sin6->sin6_port = htons(port);
+	sin6->sin6_flowinfo = 0;
+	sin6->sin6_scope_id = 0;
+
+	if (inet_pton(AF_INET6, s, &sin6->sin6_addr) != 1) {
+		fprintf(stderr, "Failed to translate %s into sin6_addr\n", s);
+		return -1;
+	}
+
+	if (iface != NULL && IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
+		sin6->sin6_scope_id = if_nametoindex(iface);
+	}
+
+	return 0;
+}
+
+/*
+ * Parse the textual address addr into *saddr together with port.
+ *
+ * The family is chosen by a simple test: any ':' in addr selects the IPv6
+ * parser, otherwise the IPv4 parser is used.  iface is only forwarded to
+ * the IPv6 parser.  Returns whatever the selected parser returns (0 on
+ * success, -1 on failure).
+ */
+int parse_ip(const char *addr, const char *iface, unsigned port,
+	     sock_addr *saddr)
+{
+	if (strchr(addr, ':') == NULL) {
+		return parse_ipv4(addr, port, &saddr->ip);
+	}
+	return parse_ipv6(addr, iface, port, saddr);
+}
+
+/*
+ * Parse a string of the form "<address>:<port>" into *saddr.
+ *
+ * The text after the last ':' is the port; everything before it is handed
+ * to parse_ip() with no interface name.  The port must consist of decimal
+ * digits only and must fit in 16 bits, because it is later narrowed by
+ * htons().  addr itself is not modified; a private copy is split instead.
+ *
+ * Returns 0 on success, -1 after printing a diagnostic on stderr.
+ */
+int parse_ip_port(const char *addr, sock_addr *saddr)
+{
+	char *copy;
+	char *colon;
+	char *endp = NULL;
+	unsigned long port;
+	int ret = -1;
+
+	copy = strdup(addr);
+	if (!copy) {
+		fprintf(stderr, "Failed strdup()\n");
+		return -1;
+	}
+
+	colon = strrchr(copy, ':');
+	if (!colon) {
+		fprintf(stderr, "This addr: %s does not contain a port number\n",
+			copy);
+		goto out;
+	}
+
+	/* strtoul() would accept "", " 80", "+80" and "-1"; require a digit. */
+	if (colon[1] < '0' || colon[1] > '9') {
+		fprintf(stderr, "Invalid port number in %s\n", copy);
+		goto out;
+	}
+
+	errno = 0;
+	port = strtoul(colon + 1, &endp, 10);
+	if (!endp || *endp != '\0') {
+		fprintf(stderr, "Trailing garbage after the port in %s\n", copy);
+		goto out;
+	}
+	if (errno == ERANGE || port > 65535) {
+		fprintf(stderr, "Port number out of range in %s\n", copy);
+		goto out;
+	}
+
+	/* Cut the copy at the colon so only the address part remains. */
+	*colon = '\0';
+	ret = parse_ip(copy, NULL, (unsigned)port, saddr);
+out:
+	free(copy);
+	return ret;
+}
+
+/*
+ * IPv4 half of send_tickle_ack(): build a bare IP header + TCP header with
+ * the ACK flag (and RST when rst is non-zero) and send it from src to dst
+ * through a raw socket.  Returns 0 on success, -1 after a message on
+ * stderr.
+ */
+static int send_tickle_ack4(const struct sockaddr_in *dst,
+			    const struct sockaddr_in *src,
+			    uint32_t seq, uint32_t ack, int rst)
+{
+	struct {
+		struct iphdr ip;
+		struct tcphdr tcp;
+	} pkt;
+	int one = 1;
+	int saved_errno;
+	ssize_t sent;
+	int s;
+
+	memset(&pkt, 0, sizeof(pkt));
+	pkt.ip.version = 4;
+	pkt.ip.ihl = sizeof(pkt.ip)/4;
+	pkt.ip.tot_len = htons(sizeof(pkt));
+	pkt.ip.ttl = 255;
+	pkt.ip.protocol = IPPROTO_TCP;
+	pkt.ip.saddr = src->sin_addr.s_addr;
+	pkt.ip.daddr = dst->sin_addr.s_addr;
+	/* Left at zero; raw(7) documents that the kernel fills it in. */
+	pkt.ip.check = 0;
+
+	pkt.tcp.source = src->sin_port;
+	pkt.tcp.dest = dst->sin_port;
+	/* seq and ack are stored exactly as passed in (no byte swapping). */
+	pkt.tcp.seq = seq;
+	pkt.tcp.ack_seq = ack;
+	pkt.tcp.ack = 1;
+	if (rst) {
+		pkt.tcp.rst = 1;
+	}
+	pkt.tcp.doff = sizeof(pkt.tcp)/4;
+	pkt.tcp.window = htons(1234);
+	pkt.tcp.check = tcp_checksum((uint16_t *)&pkt.tcp, sizeof(pkt.tcp),
+				     &pkt.ip);
+
+	/* The protocol argument of socket() is in host byte order. */
+	s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
+	if (s == -1) {
+		fprintf(stderr, "Failed to open raw socket (%s)\n",
+			strerror(errno));
+		return -1;
+	}
+
+	if (setsockopt(s, SOL_IP, IP_HDRINCL, &one, sizeof(one)) != 0) {
+		fprintf(stderr, "Failed to setup IP headers (%s)\n",
+			strerror(errno));
+		close(s);
+		return -1;
+	}
+
+	set_nonblocking(s);
+	set_close_on_exec(s);
+
+	sent = sendto(s, &pkt, sizeof(pkt), 0,
+		      (const struct sockaddr *)dst, sizeof(*dst));
+	/* close() may overwrite errno, so capture it before cleaning up. */
+	saved_errno = errno;
+	close(s);
+	if (sent != (ssize_t)sizeof(pkt)) {
+		fprintf(stderr, "Failed sendto (%s)\n", strerror(saved_errno));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * IPv6 half of send_tickle_ack(): build a bare IPv6 header + TCP header
+ * with the ACK flag (and RST when rst is non-zero) and send it from src to
+ * dst through a raw socket.  Returns 0 on success, -1 after a message on
+ * stderr.
+ */
+static int send_tickle_ack6(const struct sockaddr_in6 *dst,
+			    const struct sockaddr_in6 *src,
+			    uint32_t seq, uint32_t ack, int rst)
+{
+	struct {
+		struct ip6_hdr ip6;
+		struct tcphdr tcp;
+	} pkt;
+	struct sockaddr_in6 to;
+	int saved_errno;
+	ssize_t sent;
+	int s;
+
+	memset(&pkt, 0, sizeof(pkt));
+	pkt.ip6.ip6_vfc = 0x60;
+	/* Payload length: just the TCP header, there is no data. */
+	pkt.ip6.ip6_plen = htons(sizeof(pkt.tcp));
+	pkt.ip6.ip6_nxt = IPPROTO_TCP;
+	pkt.ip6.ip6_hlim = 64;
+	pkt.ip6.ip6_src = src->sin6_addr;
+	pkt.ip6.ip6_dst = dst->sin6_addr;
+
+	pkt.tcp.source = src->sin6_port;
+	pkt.tcp.dest = dst->sin6_port;
+	/* seq and ack are stored exactly as passed in (no byte swapping). */
+	pkt.tcp.seq = seq;
+	pkt.tcp.ack_seq = ack;
+	pkt.tcp.ack = 1;
+	if (rst) {
+		pkt.tcp.rst = 1;
+	}
+	pkt.tcp.doff = sizeof(pkt.tcp)/4;
+	pkt.tcp.window = htons(1234);
+	pkt.tcp.check = tcp_checksum6((uint16_t *)&pkt.tcp, sizeof(pkt.tcp),
+				      &pkt.ip6);
+
+	s = socket(PF_INET6, SOCK_RAW, IPPROTO_RAW);
+	if (s == -1) {
+		fprintf(stderr, "Failed to open sending socket\n");
+		return -1;
+	}
+
+	/*
+	 * The address handed to sendto() must carry port 0.  Work on a local
+	 * copy so the caller's const destination is never written to.
+	 * NOTE(review): presumably needed because Linux raw IPv6 sockets
+	 * interpret a non-zero sin6_port as a protocol number -- confirm.
+	 */
+	to = *dst;
+	to.sin6_port = 0;
+
+	sent = sendto(s, &pkt, sizeof(pkt), 0,
+		      (const struct sockaddr *)&to, sizeof(to));
+	/* close() may overwrite errno, so capture it before cleaning up. */
+	saved_errno = errno;
+	close(s);
+	if (sent != (ssize_t)sizeof(pkt)) {
+		fprintf(stderr, "Failed sendto (%s)\n", strerror(saved_errno));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Send a single TCP segment that claims to come from src and is addressed
+ * to dst, with the ACK flag set and, when rst is non-zero, RST as well.
+ *
+ * dst, src: endpoints; both must be of the same family (IPv4 or IPv6).
+ *           Ports are taken as stored in the structures.
+ * seq, ack: values copied verbatim into the TCP sequence and
+ *           acknowledgement fields.
+ *
+ * Returns 0 if the whole packet was handed to the kernel, -1 otherwise
+ * (a diagnostic is printed on stderr).
+ */
+int send_tickle_ack(const sock_addr *dst,
+		    const sock_addr *src,
+		    uint32_t seq, uint32_t ack, int rst)
+{
+	sa_family_t family = src->ip.sin_family;
+
+	if (family != AF_INET && family != AF_INET6) {
+		fprintf(stderr, "Not an ipv4/v6 address\n");
+		return -1;
+	}
+	if (dst->sa.sa_family != family) {
+		fprintf(stderr,
+			"Source and destination address families differ\n");
+		return -1;
+	}
+
+	if (family == AF_INET) {
+		return send_tickle_ack4(&dst->ip, &src->ip, seq, ack, rst);
+	}
+	return send_tickle_ack6(&dst->ip6, &src->ip6, seq, ack, rst);
+}
+
+/*
+ * Write the command-line synopsis to standard output and terminate the
+ * process with exit status 1.  Does not return.
+ */
+static void usage(void)
+{
+	fputs("Usage: ./tickle_tcp <remote_ip:port> <local_ip:port>\n", stdout);
+	exit(1);
+}
+
+/*
+ * Entry point: tickle_tcp <remote_ip:port> <local_ip:port>
+ *
+ * argv[1] is parsed as the destination and argv[2] as the source of one
+ * ACK segment sent with sequence number 0, acknowledgement number 0 and
+ * no RST flag.  Arguments beyond the second are ignored.
+ *
+ * Exit status is 0 on success and EXIT_FAILURE on any error; with too few
+ * arguments usage() prints the synopsis and exits.
+ */
+int main(int argc, char *argv[])
+{
+	sock_addr src, dst;
+
+	if (argc < 3) {
+		usage();
+	}
+
+	/* Start from all-zero addresses so no field is left indeterminate. */
+	memset(&src, 0, sizeof(src));
+	memset(&dst, 0, sizeof(dst));
+
+	if (parse_ip_port(argv[1], &dst)) {
+		fprintf(stderr, "Bad IP:port '%s'\n", argv[1]);
+		return EXIT_FAILURE;
+	}
+	if (parse_ip_port(argv[2], &src)) {
+		fprintf(stderr, "Bad IP:port '%s'\n", argv[2]);
+		return EXIT_FAILURE;
+	}
+
+	if (send_tickle_ack(&dst, &src, 0, 0, 0)) {
+		fprintf(stderr, "Error while sending tickle ack\n");
+		return EXIT_FAILURE;
+	}
+
+	return EXIT_SUCCESS;
+}
Index: resource-agents/heartbeat/IPaddr2
===================================================================
--- resource-agents.orig/heartbeat/IPaddr2
+++ resource-agents/heartbeat/IPaddr2
@@ -56,6 +56,7 @@
# OCF_RESKEY_arp_count
# OCF_RESKEY_arp_bg
# OCF_RESKEY_arp_mac
+# OCF_RESKEY_tickle_dir
#
# OCF_RESKEY_CRM_meta_clone
# OCF_RESKEY_CRM_meta_clone_max
@@ -68,6 +69,7 @@
SENDARP=$HA_BIN/send_arp
FINDIF=$HA_BIN/findif
+TICKLETCP=$HA_BIN/tickle_tcp
VLDIR=$HA_RSCTMP/IPaddr
SENDARPPIDDIR=$HA_RSCTMP/send_arp
CIP_lockfile=$HA_RSCTMP/IPaddr2-CIP-${OCF_RESKEY_ip}
@@ -220,6 +222,14 @@ You really shouldn't be touching this.
<content type="string" default="ffffffffffff"/>
</parameter>
+<parameter name="tickle_dir">
+<longdesc lang="en">
+The directory which is used to store the established TCP connections.
+</longdesc>
+<shortdesc lang="en">Tickle directory</shortdesc>
+<content type="string" default=""/>
+</parameter>
+
</parameters>
<actions>
@@ -520,6 +530,27 @@ run_send_arp() {
esac
}
+save_tcp_connections() {
+ mydir=$OCF_RESKEY_tickle_dir/`hostname`
+ rm -f $mydir/*
+ netstat -tn |egrep
'^tcp[[:space:]]+[0-9]+[[:space:]]+[0-9]+[[:space:]]+[0-9\.]+:[0-9]+.*ESTABLISHED'
|
+ awk '{print $4" "$5}' |
+ while read server client; do
+ ip=${server%:*}
+ echo $client $server >> $mydir/$ip
+ done
+}
+
+#
+# Send tickle ACKs for every connection recorded for $OCF_RESKEY_ip.
+#
+# The per-host directories below $OCF_RESKEY_tickle_dir are searched for a
+# file named after the address; each "<client> <server>" line found there
+# is passed to $TICKLETCP three times.  Does nothing if the helper binary
+# is not installed.
+#
+run_tickle_tcp() {
+	[ -x "$TICKLETCP" ] || return 0
+
+	local f
+	for f in "$OCF_RESKEY_tickle_dir"/*/"$OCF_RESKEY_ip"; do
+		# An unmatched glob is left as the literal pattern; skip it.
+		[ -f "$f" ] || continue
+		while read client server; do
+			for i in `seq 1 3`; do
+				$TICKLETCP "$client" "$server"
+			done
+		done < "$f"
+	done
+}
+
#
# Run ipoibarping to note peers about new Infiniband address
#
@@ -663,9 +694,15 @@ ip_start() {
run_send_ib_arp
;;
*)
- if [ -x $SENDARP ]; then
- run_send_arp
- fi
+ if [ -x $SENDARP ]; then
+ run_send_arp
+ fi
+
+ if [ -n "$OCF_RESKEY_tickle_dir" ]; then
+ mkdir -p $OCF_RESKEY_tickle_dir/`hostname`
+ echo 1 > /proc/sys/net/ipv4/tcp_tw_recycle
+ run_tickle_tcp
+ fi
;;
esac
exit $OCF_SUCCESS
@@ -741,6 +778,7 @@ ip_monitor() {
local ip_status=`ip_served`
case $ip_status in
ok)
+ save_tcp_connections
return $OCF_SUCCESS
;;
partial|no)
_______________________________________________________
Linux-HA-Dev: [email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev
Home Page: http://linux-ha.org/