On Mon, Nov 16, 2009 at 07:47:34PM +0100, Lars Ellenberg wrote:
> On Mon, Nov 16, 2009 at 07:10:12PM +0100, Dejan Muhamedagic wrote:
> > Hi,
> > 
> > On Mon, Nov 02, 2009 at 12:07:59PM -0500, Sam Tran wrote:
> > > Hi All,
> > > 
> > > I followed the thread "Food for thought: add something like cutter to
> > > IPaddr2 (or portblock?) RA"
> > > (http://lists.linux-ha.org/pipermail/linux-ha-dev/2008-October/016196.html)
> > > with great interest.
> > > 
> > > I am working on a cluster of Master OpenLDAP servers using Pacemaker
> > > and OpenAIS. The problem I have lies in the replication between the
> > > master server that holds the IP address resource and a replica server.
> > > In the "refreshAndPersist" replication mode that is being used, the
> > > replica polls the master server for updates, then the connection
> > > between the replica and the master server is maintained, and the
> > > replica is waiting for subsequent updates from the master server. In
> > > the event of a failure of the initial master the new master is taking
> > > over the IP address resource, but doesn't know anything about the
> > > previous persist stage, therefore is not able to send new updates to
> > > the replica. An RST needs to be sent to the replica in order to
> > > terminate the existing session and force a polling retry from the
> > > replica, or the replica would wait for the session to time out.
> > > 
> > > I was wondering whether any work has been done on the
> > > implementation of the tickle ACK feature in the IPaddr2 RA.
> > 
> > Not to my knowledge. It would obviously be a good feature. The
> > only thing which is not clear to me is who would
> > keep/maintain/synchronize the connections database, and how.
> 
> As a "best effort" sort of thing, you could do a "depth=X" monitoring
> action in the IPaddr2 RA, which would 
>  grep "ESTABLISHED" /proc/net/nf_conntrack |
>  dd conv=fsync of=/somewhere/on/DRBD/or/NFS/or/iSCSI
> 
> On stop, it may (optionally?) truncate that state file.
> 
> On start, it would (optionally?) check that state file,
> and send out "Tickle ACKs".
> 
> You will miss only those connections that have been
> established since the last "grep", i.e. since the last
> "monitor depth=X".  If you want more, use conntrackd.
> 
> Volunteers?
> 

This is a simple implementation of the tickle ACK feature in the IPaddr2
RA. Basically the code is borrowed from ctdb.samba.org, but I haven't
tested it in a Heartbeat/OpenAIS cluster environment yet, so it may not
work for now :)

Thanks,
Jiaju

---
Index: resource-agents/tools/tickle_tcp.c
===================================================================
--- /dev/null
+++ resource-agents/tools/tickle_tcp.c
@@ -0,0 +1,316 @@
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/tcp.h>
+
+#define discard_const(ptr) ((void *)((intptr_t)(ptr))) /* drop a const qualifier by round-tripping the pointer through intptr_t */
+
+typedef union {                        /* one socket address, viewable as generic, IPv4 or IPv6 */
+       struct sockaddr     sa;         /* generic view */
+       struct sockaddr_in  ip;         /* IPv4 view; send_tickle_ack() switches on ip.sin_family */
+       struct sockaddr_in6 ip6;        /* IPv6 view */
+} sock_addr;
+
+/*
+ * Add up a buffer as a sequence of 16-bit words in network byte order.
+ *
+ * data: start of the buffer (read byte-wise, so no alignment is required)
+ * n:    length of the buffer in bytes
+ *
+ * Returns the unfolded 32-bit sum; the caller folds the carries and
+ * complements it. An odd trailing byte is treated as the high-order octet
+ * of a word whose low-order octet is zero, independent of host endianness.
+ */
+uint32_t uint16_checksum(uint16_t *data, size_t n)
+{
+       const uint8_t *bytes = (const uint8_t *)data;
+       uint32_t sum = 0;
+
+       for (; n >= 2; n -= 2, bytes += 2) {
+               sum += ((uint32_t)bytes[0] << 8) | bytes[1];
+       }
+       if (n == 1) {
+               /* pad the odd byte on the right with a zero octet */
+               sum += (uint32_t)bytes[0] << 8;
+       }
+       return sum;
+}
+
+/*
+ * Compute the TCP checksum of an IPv4 segment.
+ *
+ * data: the TCP header (plus payload, if any)
+ * n:    its length in bytes
+ * ip:   IPv4 header supplying the pseudo-header fields (saddr, daddr,
+ *       protocol)
+ *
+ * Returns the checksum ready to be stored in the TCP header; a computed
+ * value of 0 is reported as 0xFFFF.
+ */
+static uint16_t tcp_checksum(uint16_t *data, size_t n, struct iphdr *ip)
+{
+       uint32_t acc;
+       uint16_t result;
+
+       /* the segment itself */
+       acc  = uint16_checksum(data, n);
+
+       /* pseudo-header: both addresses, the protocol number and the length */
+       acc += uint16_checksum((uint16_t *)(void *)&ip->saddr,
+                               sizeof(ip->saddr));
+       acc += uint16_checksum((uint16_t *)(void *)&ip->daddr,
+                               sizeof(ip->daddr));
+       acc += ip->protocol + n;
+
+       /* fold the carries back in; a second pass absorbs a carry from the first */
+       acc = (acc >> 16) + (acc & 0xFFFF);
+       acc = (acc >> 16) + (acc & 0xFFFF);
+
+       result = ~htons(acc);
+       return result == 0 ? 0xFFFF : result;
+}
+
+/*
+ * Compute the TCP checksum of an IPv6 segment.
+ *
+ * data: the TCP header (plus payload, if any)
+ * n:    its length in bytes
+ * ip6:  IPv6 header supplying the pseudo-header fields (source,
+ *       destination, next header)
+ *
+ * Returns the checksum ready to be stored in the TCP header; a computed
+ * value of 0 is reported as 0xFFFF.
+ */
+static uint16_t tcp_checksum6(uint16_t *data, size_t n, struct ip6_hdr *ip6)
+{
+       uint32_t pseudo_tail[2];
+       uint32_t acc;
+       uint16_t result;
+
+       /* last 8 bytes of the pseudo-header: 32-bit length, 32-bit next header */
+       pseudo_tail[0] = htonl(n);
+       pseudo_tail[1] = htonl(ip6->ip6_nxt);
+
+       acc  = uint16_checksum((uint16_t *)(void *)&ip6->ip6_src, 16);
+       acc += uint16_checksum((uint16_t *)(void *)&ip6->ip6_dst, 16);
+       acc += uint16_checksum((uint16_t *)pseudo_tail, 8);
+       acc += uint16_checksum(data, n);
+
+       /* fold the carries back in; a second pass absorbs a carry from the first */
+       acc = (acc >> 16) + (acc & 0xFFFF);
+       acc = (acc >> 16) + (acc & 0xFFFF);
+
+       result = ~htons(acc);
+       return result == 0 ? 0xFFFF : result;
+}
+
+/*
+ * Add O_NONBLOCK to the file status flags of fd.
+ *
+ * Best effort: if the current flags cannot be read, the descriptor is left
+ * untouched rather than having the error value written back as flags.
+ */
+void set_nonblocking(int fd)
+{
+       int flags = fcntl(fd, F_GETFL, 0);
+
+       if (flags == -1) {
+               return;
+       }
+       fcntl(fd, F_SETFL, flags | O_NONBLOCK);
+}
+
+/*
+ * Add FD_CLOEXEC to the descriptor flags of fd.
+ *
+ * Best effort: if the current flags cannot be read, the descriptor is left
+ * untouched rather than having the error value written back as flags.
+ */
+void set_close_on_exec(int fd)
+{
+       int flags = fcntl(fd, F_GETFD, 0);
+
+       if (flags == -1) {
+               return;
+       }
+       fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
+}
+
+/*
+ * Fill *sin from a dotted-quad string and a port.
+ *
+ * s:    textual IPv4 address
+ * port: port number in host byte order (stored in network byte order)
+ * sin:  destination; family and port are written even when s is rejected
+ *
+ * Returns 0 on success, -1 (after a message on stderr) if s does not parse.
+ */
+static int parse_ipv4(const char *s, unsigned port, struct sockaddr_in *sin)
+{
+       int converted;
+
+       sin->sin_port   = htons(port);
+       sin->sin_family = AF_INET;
+
+       converted = inet_pton(AF_INET, s, &sin->sin_addr);
+       if (converted == 1) {
+               return 0;
+       }
+
+       fprintf(stderr, "Failed to translate %s into sin_addr\n", s);
+       return -1;
+}
+
+/*
+ * Fill saddr->ip6 from a textual IPv6 address and a port.
+ *
+ * s:     textual IPv6 address
+ * iface: interface name used to derive the scope id of a link-local
+ *        address; may be NULL, in which case the scope id stays 0
+ * port:  port number in host byte order (stored in network byte order)
+ * saddr: destination; family, port, flowinfo and scope id are written even
+ *        when s is rejected
+ *
+ * Returns 0 on success, -1 (after a message on stderr) if s does not parse.
+ */
+static int parse_ipv6(const char *s, const char *iface, unsigned port,
+                      sock_addr *saddr)
+{
+       struct sockaddr_in6 *sin6 = &saddr->ip6;
+
+       sin6->sin6_family   = AF_INET6;
+       sin6->sin6_port     = htons(port);
+       sin6->sin6_flowinfo = 0;
+       sin6->sin6_scope_id = 0;
+
+       if (inet_pton(AF_INET6, s, &sin6->sin6_addr) != 1) {
+               fprintf(stderr, "Failed to translate %s into sin6_addr\n", s);
+               return -1;
+       }
+
+       if (iface != NULL && IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
+               sin6->sin6_scope_id = if_nametoindex(iface);
+       }
+
+       return 0;
+}
+
+/*
+ * Parse a textual IPv4 or IPv6 address into *saddr.
+ *
+ * addr:  address string; any ':' in it selects the IPv6 parser
+ * iface: interface name handed to the IPv6 parser (may be NULL); unused
+ *        for IPv4
+ * port:  port number in host byte order
+ * saddr: destination
+ *
+ * Returns 0 on success, -1 if the address does not parse.
+ */
+int parse_ip(const char *addr, const char *iface, unsigned port,
+             sock_addr *saddr)
+{
+       /* strchr() is ISO C and declared by <string.h>; index() is not */
+       if (strchr(addr, ':') == NULL) {
+               return parse_ipv4(addr, port, &saddr->ip);
+       }
+       return parse_ipv6(addr, iface, port, saddr);
+}
+
+/*
+ * Parse "<address>:<port>" into *saddr.
+ *
+ * addr:  input string; the text after the LAST ':' is the decimal port, so
+ *        an unbracketed IPv6 address followed by ":port" is accepted
+ * saddr: destination
+ *
+ * Returns 0 on success. Returns -1, after printing a message on stderr,
+ * when the string has no ':', the port is empty, has trailing garbage or
+ * is outside 0..65535, or the address part does not parse.
+ */
+int parse_ip_port(const char *addr, sock_addr *saddr)
+{
+       char *copy;
+       char *colon;
+       char *endp = NULL;
+       unsigned long port;
+       int ret = -1;
+
+       /* work on a copy: the string is split at the colon below */
+       copy = strdup(addr);
+       if (!copy) {
+               fprintf(stderr, "Failed strdup()\n");
+               return -1;
+       }
+
+       colon = strrchr(copy, ':');
+       if (!colon) {
+               fprintf(stderr, "This addr: %s does not contain a port number\n", copy);
+               goto out;
+       }
+
+       errno = 0;
+       port = strtoul(colon + 1, &endp, 10);
+       if (endp == colon + 1) {
+               /* strtoul consumed nothing, e.g. "1.2.3.4:" */
+               fprintf(stderr, "Missing port number in %s\n", copy);
+               goto out;
+       }
+       if (*endp != '\0') {
+               fprintf(stderr, "Trailing garbage after the port in %s\n", copy);
+               goto out;
+       }
+       if (errno == ERANGE || port > 65535) {
+               /* would otherwise be silently truncated to 16 bits by htons() */
+               fprintf(stderr, "Port out of range in %s\n", copy);
+               goto out;
+       }
+       *colon = '\0';
+
+       ret = parse_ip(copy, NULL, (unsigned)port, saddr);
+out:
+       free(copy);
+       return ret;
+}
+
+/*
+ * Send one bare IPv4 TCP segment (ACK set, optionally RST) from src to dst
+ * through a raw socket.
+ *
+ * Returns 0 on success, -1 after printing a message on stderr.
+ */
+static int send_tickle_ack4(const sock_addr *dst, const sock_addr *src,
+                            uint32_t seq, uint32_t ack, int rst)
+{
+       struct {
+               struct iphdr ip;
+               struct tcphdr tcp;
+       } pkt;
+       uint32_t one = 1;
+       ssize_t sent;
+       int saved_errno;
+       int s;
+
+       memset(&pkt, 0, sizeof(pkt));
+       pkt.ip.version  = 4;
+       pkt.ip.ihl      = sizeof(pkt.ip)/4;
+       pkt.ip.tot_len  = htons(sizeof(pkt));
+       pkt.ip.ttl      = 255;
+       pkt.ip.protocol = IPPROTO_TCP;
+       pkt.ip.saddr    = src->ip.sin_addr.s_addr;
+       pkt.ip.daddr    = dst->ip.sin_addr.s_addr;
+       pkt.ip.check    = 0;
+
+       pkt.tcp.source  = src->ip.sin_port;
+       pkt.tcp.dest    = dst->ip.sin_port;
+       pkt.tcp.seq     = seq;
+       pkt.tcp.ack_seq = ack;
+       pkt.tcp.ack     = 1;
+       if (rst) {
+               pkt.tcp.rst = 1;
+       }
+       pkt.tcp.doff    = sizeof(pkt.tcp)/4;
+       pkt.tcp.window  = htons(1234);
+       /* must come last: the checksum covers every TCP field set above */
+       pkt.tcp.check   = tcp_checksum((uint16_t *)&pkt.tcp, sizeof(pkt.tcp), &pkt.ip);
+
+       /* the protocol argument is a plain int, not a wire-format field: no htons() */
+       s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
+       if (s == -1) {
+               fprintf(stderr, "Failed to open raw socket (%s)\n", strerror(errno));
+               return -1;
+       }
+
+       if (setsockopt(s, SOL_IP, IP_HDRINCL, &one, sizeof(one)) != 0) {
+               fprintf(stderr, "Failed to setup IP headers (%s)\n", strerror(errno));
+               close(s);
+               return -1;
+       }
+
+       set_nonblocking(s);
+       set_close_on_exec(s);
+
+       sent = sendto(s, &pkt, sizeof(pkt), 0,
+                     (const struct sockaddr *)&dst->ip, sizeof(dst->ip));
+       saved_errno = errno;    /* close() below may overwrite errno */
+       close(s);
+
+       if (sent != (ssize_t)sizeof(pkt)) {
+               fprintf(stderr, "Failed sendto (%s)\n", strerror(saved_errno));
+               return -1;
+       }
+       return 0;
+}
+
+/*
+ * Send one bare IPv6 TCP segment (ACK set, optionally RST) from src to dst
+ * through a raw socket.
+ *
+ * Returns 0 on success, -1 after printing a message on stderr.
+ */
+static int send_tickle_ack6(const sock_addr *dst, const sock_addr *src,
+                            uint32_t seq, uint32_t ack, int rst)
+{
+       struct {
+               struct ip6_hdr ip6;
+               struct tcphdr tcp;
+       } pkt;
+       struct sockaddr_in6 to;
+       ssize_t sent;
+       int saved_errno;
+       int s;
+
+       memset(&pkt, 0, sizeof(pkt));
+       pkt.ip6.ip6_vfc  = 0x60;
+       pkt.ip6.ip6_plen = htons(sizeof(pkt.tcp));
+       pkt.ip6.ip6_nxt  = IPPROTO_TCP;
+       pkt.ip6.ip6_hlim = 64;
+       pkt.ip6.ip6_src  = src->ip6.sin6_addr;
+       pkt.ip6.ip6_dst  = dst->ip6.sin6_addr;
+
+       pkt.tcp.source   = src->ip6.sin6_port;
+       pkt.tcp.dest     = dst->ip6.sin6_port;
+       pkt.tcp.seq      = seq;
+       pkt.tcp.ack_seq  = ack;
+       pkt.tcp.ack      = 1;
+       if (rst) {
+               pkt.tcp.rst = 1;
+       }
+       pkt.tcp.doff     = sizeof(pkt.tcp)/4;
+       pkt.tcp.window   = htons(1234);
+       /* must come last: the checksum covers every TCP field set above */
+       pkt.tcp.check    = tcp_checksum6((uint16_t *)&pkt.tcp, sizeof(pkt.tcp), &pkt.ip6);
+
+       s = socket(PF_INET6, SOCK_RAW, IPPROTO_RAW);
+       if (s == -1) {
+               fprintf(stderr, "Failed to open sending socket (%s)\n", strerror(errno));
+               return -1;
+       }
+
+       /*
+        * The address handed to sendto() carries port 0. A private copy is
+        * used so the caller's const destination is never written to.
+        * NOTE(review): presumably a raw IPv6 socket rejects a non-zero
+        * sin6_port -- confirm against ipv6(7).
+        */
+       to = dst->ip6;
+       to.sin6_port = 0;
+
+       sent = sendto(s, &pkt, sizeof(pkt), 0, (struct sockaddr *)&to, sizeof(to));
+       saved_errno = errno;    /* close() below may overwrite errno */
+       close(s);
+
+       if (sent != (ssize_t)sizeof(pkt)) {
+               fprintf(stderr, "Failed sendto (%s)\n", strerror(saved_errno));
+               return -1;
+       }
+       return 0;
+}
+
+/*
+ * Send a "tickle" TCP segment that claims to come from src and is addressed
+ * to dst, so the peer notices that its view of the connection is stale.
+ *
+ * dst, src: endpoints; ports are expected in network byte order, as stored
+ *           by parse_ip_port(). Both must be of the same address family.
+ * seq, ack: copied verbatim into the TCP header (no byte-order conversion)
+ * rst:      non-zero to set the RST flag in addition to ACK
+ *
+ * Returns 0 on success, -1 after printing a message on stderr.
+ */
+int send_tickle_ack(const sock_addr *dst,
+                    const sock_addr *src,
+                    uint32_t seq, uint32_t ack, int rst)
+{
+       if (src->sa.sa_family != dst->sa.sa_family) {
+               fprintf(stderr, "Source and destination address families differ\n");
+               return -1;
+       }
+
+       switch (src->ip.sin_family) {
+       case AF_INET:
+               return send_tickle_ack4(dst, src, seq, ack, rst);
+
+       case AF_INET6:
+               return send_tickle_ack6(dst, src, seq, ack, rst);
+
+       default:
+               fprintf(stderr, "Not an ipv4/v6 address\n");
+               return -1;
+       }
+}
+
+/* Print the command-line synopsis on stdout and exit with status 1. */
+static void usage(void)
+{
+       fputs("Usage: ./tickle_tcp <remote_ip:port> <local_ip:port>\n", stdout);
+       exit(1);
+}
+
+/*
+ * tickle_tcp <remote_ip:port> <local_ip:port>
+ *
+ * Sends one tickle ACK (seq 0, ack 0, no RST) to the remote endpoint with
+ * the local endpoint as its source. Returns 0 on success and -1 on any
+ * parse or send failure; with too few arguments usage() exits with 1.
+ */
+int main(int argc, char *argv[])
+{
+       sock_addr src, dst;
+       const char *remote;
+       const char *local;
+
+       if (argc < 3) {
+               usage();
+       }
+       remote = argv[1];
+       local  = argv[2];
+
+       if (parse_ip_port(remote, &dst) != 0) {
+               fprintf(stderr, "Bad IP:port '%s'\n", remote);
+               return -1;
+       }
+       if (parse_ip_port(local, &src) != 0) {
+               fprintf(stderr, "Bad IP:port '%s'\n", local);
+               return -1;
+       }
+
+       if (send_tickle_ack(&dst, &src, 0, 0, 0) != 0) {
+               fprintf(stderr, "Error while sending tickle ack\n");
+               return -1;
+       }
+
+       return 0;
+}
Index: resource-agents/heartbeat/IPaddr2
===================================================================
--- resource-agents.orig/heartbeat/IPaddr2
+++ resource-agents/heartbeat/IPaddr2
@@ -56,6 +56,7 @@
 #      OCF_RESKEY_arp_count
 #      OCF_RESKEY_arp_bg
 #      OCF_RESKEY_arp_mac
+#      OCF_RESKEY_tickle_dir
 #
 #      OCF_RESKEY_CRM_meta_clone
 #      OCF_RESKEY_CRM_meta_clone_max
@@ -68,6 +69,7 @@
 
 SENDARP=$HA_BIN/send_arp
 FINDIF=$HA_BIN/findif
+TICKLETCP=$HA_BIN/tickle_tcp
 VLDIR=$HA_RSCTMP/IPaddr
 SENDARPPIDDIR=$HA_RSCTMP/send_arp
 CIP_lockfile=$HA_RSCTMP/IPaddr2-CIP-${OCF_RESKEY_ip}
@@ -220,6 +222,14 @@ You really shouldn't be touching this.
 <content type="string" default="ffffffffffff"/>
 </parameter>
 
+<parameter name="tickle_dir">
+<longdesc lang="en">
+The directory which is used to store the established TCP connections.
+</longdesc>
+<shortdesc lang="en">Tickle directory</shortdesc>
+<content type="string" default=""/>
+</parameter>
+
 </parameters>
 
 <actions>
@@ -520,6 +530,27 @@ run_send_arp() {
        esac
 }
 
+#
+# Record the currently ESTABLISHED IPv4 TCP connections of this node, one
+# file per local address, as "<client ip:port> <server ip:port>" lines under
+# $OCF_RESKEY_tickle_dir/<hostname>/.
+#
+save_tcp_connections() {
+       # Without a configured tickle_dir the path below would collapse to
+       # "/<hostname>" and the rm would act on the root filesystem.
+       [ -n "$OCF_RESKEY_tickle_dir" ] || return 0
+
+       mydir="$OCF_RESKEY_tickle_dir/`hostname`"
+       mkdir -p "$mydir" || return 0
+       rm -f "$mydir"/*
+       netstat -tn |
+               grep -E '^tcp[[:space:]]+[0-9]+[[:space:]]+[0-9]+[[:space:]]+[0-9\.]+:[0-9]+.*ESTABLISHED' |
+               awk '{print $4" "$5}' |
+               while read server client; do
+                       # file name is the local address with the port stripped
+                       ip=${server%:*}
+                       echo "$client $server" >> "$mydir/$ip"
+               done
+}
+
+#
+# Replay the connection lists saved for $OCF_RESKEY_ip by any node and send
+# three tickle ACKs to every recorded client.
+#
+run_tickle_tcp() {
+       # nothing to do if the helper binary was not installed
+       [ -x "$TICKLETCP" ] || return 0
+
+       for f in "$OCF_RESKEY_tickle_dir"/*/"$OCF_RESKEY_ip"; do
+               # an unmatched glob stays literal, hence the -f test
+               [ -f "$f" ] || continue
+               while read client server; do
+                       for i in 1 2 3; do
+                               "$TICKLETCP" "$client" "$server"
+                       done
+               done < "$f"
+       done
+}
+
 #
 # Run ipoibarping to note peers about new Infiniband address
 #
@@ -663,9 +694,15 @@ ip_start() {
                run_send_ib_arp
                ;;
        *)
-           if [ -x $SENDARP ]; then
-               run_send_arp
-           fi
+               if [ -x $SENDARP ]; then
+                       run_send_arp
+               fi
+
+               if [ -n "$OCF_RESKEY_tickle_dir" ]; then
+                       mkdir -p $OCF_RESKEY_tickle_dir/`hostname`
+                       echo 1 > /proc/sys/net/ipv4/tcp_tw_recycle
+                       run_tickle_tcp
+               fi      
                ;;
        esac
        exit $OCF_SUCCESS
@@ -741,6 +778,7 @@ ip_monitor() {
        local ip_status=`ip_served`
        case $ip_status in
        ok)
+               save_tcp_connections
                return $OCF_SUCCESS
                ;;
        partial|no)
_______________________________________________________
Linux-HA-Dev: Linux-HA-Dev@lists.linux-ha.org
http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev
Home Page: http://linux-ha.org/

Reply via email to