On Mon, Nov 16, 2009 at 07:47:34PM +0100, Lars Ellenberg wrote: > On Mon, Nov 16, 2009 at 07:10:12PM +0100, Dejan Muhamedagic wrote: > > Hi, > > > > On Mon, Nov 02, 2009 at 12:07:59PM -0500, Sam Tran wrote: > > > Hi All, > > > > > > I followed the thread "Food for thought: add something like cutter to > > > IPaddr2 (or portblock?) RA" > > > (http://lists.linux-ha.org/pipermail/linux-ha-dev/2008-October/016196.html) > > > with great interest. > > > > > > I am working on a cluster of Master OpenLDAP servers using Pacemaker > > > and OpenAIS. The problem I have lies in the replication between the > > > master server that holds the IP address resource and a replica server. > > > In the "refreshAndPersist" replication mode that is being used, the > > > replica polls the master server for updates, then the connection > > > between the replica and the master server is maintained, and the > > > replica is waiting for subsequent updates from the master server. In > > > the event of a failure of the initial master the new master is taking > > > over the IP address resource, but doesn't know anything about the > > > previous persist stage, therefore is not able to send new updates to > > > the replica. An RST needs to be sent to the replica in order to > > > terminate the existing session and force a polling retry from the > > > replica, or the replica would wait for the session to time out. > > > > > > I was wondering whether some work has been done as far as the > > > implementation of the tickle ACK feature in IPaddr2 RA is concerned. > > > > Not to my knowledge. It would obviously be a good feature. The > > only thing which is not clear to me is who/how would > > keep/maintain/synchronize the connections database. > > As a "best effort" sort of thing, you could do a "depth=X" monitoring > action in the IPaddr2 RA, which would > grep "ESTABLISHED" /proc/net/nf_conntrack | > dd conv=fsync of=/somewhere/on/DRBD/or/NFS/or/iSCSI > > On stop, it may (optionally?) 
truncate that state file. > > On start, it would (optionally?) check that state file, > and send out "Tickle ACKs". > > You will miss only those connections that have been > established since the last "grep", i.e. since the last > "monitor depth=X". If you want more, use conntrackd. > > Volunteers? >
This is a simple implementation of the tickle ACK feature in IPaddr2 RA. Basically the code is borrowed from ctdb.samba.org, but I haven't tested it in Heartbeat/openAIS cluster environment yet, so it may not work for now :) Thanks, Jiaju --- Index: resource-agents/tools/tickle_tcp.c =================================================================== --- /dev/null +++ resource-agents/tools/tickle_tcp.c @@ -0,0 +1,316 @@ +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <errno.h> +#include <unistd.h> +#include <fcntl.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/tcp.h> +#include <sys/types.h> +#include <sys/socket.h> + +#define discard_const(ptr) ((void *)((intptr_t)(ptr))) + +typedef union { + struct sockaddr sa; + struct sockaddr_in ip; + struct sockaddr_in6 ip6; +} sock_addr; + +uint32_t uint16_checksum(uint16_t *data, size_t n) +{ + uint32_t sum=0; + while (n >= 2) { + sum += (uint32_t)ntohs(*data); + data++; + n -= 2; + } + if (n == 1) { + sum += (uint32_t)ntohs(*(uint8_t *)data); + } + return sum; +} + +static uint16_t tcp_checksum(uint16_t *data, size_t n, struct iphdr *ip) +{ + uint32_t sum = uint16_checksum(data, n); + uint16_t sum2; + sum += uint16_checksum((uint16_t *)(void *)&ip->saddr, + sizeof(ip->saddr)); + sum += uint16_checksum((uint16_t *)(void *)&ip->daddr, + sizeof(ip->daddr)); + sum += ip->protocol + n; + sum = (sum & 0xFFFF) + (sum >> 16); + sum = (sum & 0xFFFF) + (sum >> 16); + sum2 = htons(sum); + sum2 = ~sum2; + if (sum2 == 0) { + return 0xFFFF; + } + return sum2; +} + +static uint16_t tcp_checksum6(uint16_t *data, size_t n, struct ip6_hdr *ip6) +{ + uint32_t phdr[2]; + uint32_t sum = 0; + uint16_t sum2; + + sum += uint16_checksum((uint16_t *)(void *)&ip6->ip6_src, 16); + sum += uint16_checksum((uint16_t *)(void *)&ip6->ip6_dst, 16); + + phdr[0] = htonl(n); + phdr[1] = htonl(ip6->ip6_nxt); + sum += uint16_checksum((uint16_t *)phdr, 8); + + sum += uint16_checksum(data, n); + + sum = (sum & 
0xFFFF) + (sum >> 16); + sum = (sum & 0xFFFF) + (sum >> 16); + sum2 = htons(sum); + sum2 = ~sum2; + if (sum2 == 0) { + return 0xFFFF; + } + return sum2; +} + +void set_nonblocking(int fd) +{ + unsigned v; + v = fcntl(fd, F_GETFL, 0); + fcntl(fd, F_SETFL, v | O_NONBLOCK); +} + +void set_close_on_exec(int fd) +{ + unsigned v; + v = fcntl(fd, F_GETFD, 0); + fcntl(fd, F_SETFD, v | FD_CLOEXEC); +} + +static int parse_ipv4(const char *s, unsigned port, struct sockaddr_in *sin) +{ + sin->sin_family = AF_INET; + sin->sin_port = htons(port); + + if (inet_pton(AF_INET, s, &sin->sin_addr) != 1) { + fprintf(stderr, "Failed to translate %s into sin_addr\n", s); + return -1; + } + + return 0; +} + +static int parse_ipv6(const char *s, const char *iface, unsigned port, sock_addr *saddr) +{ + saddr->ip6.sin6_family = AF_INET6; + saddr->ip6.sin6_port = htons(port); + saddr->ip6.sin6_flowinfo = 0; + saddr->ip6.sin6_scope_id = 0; + + if (inet_pton(AF_INET6, s, &saddr->ip6.sin6_addr) != 1) { + fprintf(stderr, "Failed to translate %s into sin6_addr\n", s); + return -1; + } + + if (iface && IN6_IS_ADDR_LINKLOCAL(&saddr->ip6.sin6_addr)) { + saddr->ip6.sin6_scope_id = if_nametoindex(iface); + } + + return 0; +} + +int parse_ip(const char *addr, const char *iface, unsigned port, sock_addr *saddr) +{ + char *p; + int ret; + + p = index(addr, ':'); + if (!p) + ret = parse_ipv4(addr, port, &saddr->ip); + else + ret = parse_ipv6(addr, iface, port, saddr); + + return ret; +} + +int parse_ip_port(const char *addr, sock_addr *saddr) +{ + char *s, *p; + unsigned port; + char *endp = NULL; + int ret; + + s = strdup(addr); + if (!s) { + fprintf(stderr, "Failed strdup()\n"); + return -1; + } + + p = rindex(s, ':'); + if (!p) { + fprintf(stderr, "This addr: %s does not contain a port number\n", s); + free(s); + return -1; + } + + port = strtoul(p+1, &endp, 10); + if (!endp || *endp != 0) { + fprintf(stderr, "Trailing garbage after the port in %s\n", s); + free(s); + return -1; + } + *p = 0; + + ret = 
parse_ip(s, NULL, port, saddr); + free(s); + return ret; +} + +int send_tickle_ack(const sock_addr *dst, + const sock_addr *src, + uint32_t seq, uint32_t ack, int rst) +{ + int s; + int ret; + uint32_t one = 1; + uint16_t tmpport; + sock_addr *tmpdest; + struct { + struct iphdr ip; + struct tcphdr tcp; + } ip4pkt; + struct { + struct ip6_hdr ip6; + struct tcphdr tcp; + } ip6pkt; + + switch (src->ip.sin_family) { + case AF_INET: + memset(&ip4pkt, 0, sizeof(ip4pkt)); + ip4pkt.ip.version = 4; + ip4pkt.ip.ihl = sizeof(ip4pkt.ip)/4; + ip4pkt.ip.tot_len = htons(sizeof(ip4pkt)); + ip4pkt.ip.ttl = 255; + ip4pkt.ip.protocol = IPPROTO_TCP; + ip4pkt.ip.saddr = src->ip.sin_addr.s_addr; + ip4pkt.ip.daddr = dst->ip.sin_addr.s_addr; + ip4pkt.ip.check = 0; + + ip4pkt.tcp.source = src->ip.sin_port; + ip4pkt.tcp.dest = dst->ip.sin_port; + ip4pkt.tcp.seq = seq; + ip4pkt.tcp.ack_seq = ack; + ip4pkt.tcp.ack = 1; + if (rst) + ip4pkt.tcp.rst = 1; + ip4pkt.tcp.doff = sizeof(ip4pkt.tcp)/4; + ip4pkt.tcp.window = htons(1234); + ip4pkt.tcp.check = tcp_checksum((uint16_t *)&ip4pkt.tcp, sizeof(ip4pkt.tcp), &ip4pkt.ip); + + s = socket(AF_INET, SOCK_RAW, htons(IPPROTO_RAW)); + if (s == -1) { + fprintf(stderr, "Failed to open raw socket (%s)\n", strerror(errno)); + return -1; + } + + ret = setsockopt(s, SOL_IP, IP_HDRINCL, &one, sizeof(one)); + if (ret != 0) { + fprintf(stderr, "Failed to setup IP headers (%s)\n", strerror(errno)); + close(s); + return -1; + } + + set_nonblocking(s); + set_close_on_exec(s); + + ret = sendto(s, &ip4pkt, sizeof(ip4pkt), 0, + (struct sockaddr *)&dst->ip, sizeof(dst->ip)); + close(s); + if (ret != sizeof(ip4pkt)) { + fprintf(stderr, "Failed sendto (%s)\n", strerror(errno)); + return -1; + } + break; + + case AF_INET6: + memset(&ip6pkt, 0, sizeof(ip6pkt)); + ip6pkt.ip6.ip6_vfc = 0x60; + ip6pkt.ip6.ip6_plen = htons(20); + ip6pkt.ip6.ip6_nxt = IPPROTO_TCP; + ip6pkt.ip6.ip6_hlim = 64; + ip6pkt.ip6.ip6_src = src->ip6.sin6_addr; + ip6pkt.ip6.ip6_dst = dst->ip6.sin6_addr; + 
+ ip6pkt.tcp.source = src->ip6.sin6_port; + ip6pkt.tcp.dest = dst->ip6.sin6_port; + ip6pkt.tcp.seq = seq; + ip6pkt.tcp.ack_seq = ack; + ip6pkt.tcp.ack = 1; + if (rst) + ip6pkt.tcp.rst = 1; + ip6pkt.tcp.doff = sizeof(ip6pkt.tcp)/4; + ip6pkt.tcp.window = htons(1234); + ip6pkt.tcp.check = tcp_checksum6((uint16_t *)&ip6pkt.tcp, sizeof(ip6pkt.tcp), &ip6pkt.ip6); + + s = socket(PF_INET6, SOCK_RAW, IPPROTO_RAW); + if (s == -1) { + fprintf(stderr, "Failed to open sending socket\n"); + return -1; + } + + tmpdest = discard_const(dst); + tmpport = tmpdest->ip6.sin6_port; + + tmpdest->ip6.sin6_port = 0; + ret = sendto(s, &ip6pkt, sizeof(ip6pkt), 0, (struct sockaddr *)&dst->ip6, sizeof(dst->ip6)); + tmpdest->ip6.sin6_port = tmpport; + close(s); + + if (ret != sizeof(ip6pkt)) { + fprintf(stderr, "Failed sendto (%s)\n", strerror(errno)); + return -1; + } + break; + + default: + fprintf(stderr, "Not an ipv4/v6 address\n"); + return -1; + } + + return 0; +} + +static void usage(void) +{ + printf("Usage: ./tickle_tcp <remote_ip:port> <local_ip:port>\n"); + exit(1); +} + +int main(int argc, char *argv[]) +{ + int ret; + sock_addr src, dst; + + if (argc < 3) { + usage(); + } + + if (parse_ip_port(argv[1], &dst)) { + fprintf(stderr, "Bad IP:port '%s'\n", argv[1]); + return -1; + } + if (parse_ip_port(argv[2], &src)) { + fprintf(stderr, "Bad IP:port '%s'\n", argv[2]); + return -1; + } + + if (send_tickle_ack(&dst, &src, 0, 0, 0)) { + fprintf(stderr, "Error while sending tickle ack\n"); + return -1; + } + + return 0; +} Index: resource-agents/heartbeat/IPaddr2 =================================================================== --- resource-agents.orig/heartbeat/IPaddr2 +++ resource-agents/heartbeat/IPaddr2 @@ -56,6 +56,7 @@ # OCF_RESKEY_arp_count # OCF_RESKEY_arp_bg # OCF_RESKEY_arp_mac +# OCF_RESKEY_tickle_dir # # OCF_RESKEY_CRM_meta_clone # OCF_RESKEY_CRM_meta_clone_max @@ -68,6 +69,7 @@ SENDARP=$HA_BIN/send_arp FINDIF=$HA_BIN/findif +TICKLETCP=$HA_BIN/tickle_tcp 
VLDIR=$HA_RSCTMP/IPaddr SENDARPPIDDIR=$HA_RSCTMP/send_arp CIP_lockfile=$HA_RSCTMP/IPaddr2-CIP-${OCF_RESKEY_ip} @@ -220,6 +222,14 @@ You really shouldn't be touching this. <content type="string" default="ffffffffffff"/> </parameter> +<parameter name="tickle_dir"> +<longdesc lang="en"> +The directory which is used to store the established TCP connections. +</longdesc> +<shortdesc lang="en">Tickle directory</shortdesc> +<content type="string" default=""/> +</parameter> + </parameters> <actions> @@ -520,6 +530,27 @@ run_send_arp() { esac } +save_tcp_connections() { + mydir=$OCF_RESKEY_tickle_dir/`hostname` + rm -f $mydir/* + netstat -tn |egrep '^tcp[[:space:]]+[0-9]+[[:space:]]+[0-9]+[[:space:]]+[0-9\.]+:[0-9]+.*ESTABLISHED' | + awk '{print $4" "$5}' | + while read server client; do + ip=${server%:*} + echo $client $server >> $mydir/$ip + done +} + +run_tickle_tcp() { + for f in $OCF_RESKEY_tickle_dir/*/$OCF_RESKEY_ip; do + [ -f $f ] && cat $f | while read client server; do + for i in `seq 1 3`; do + $TICKLETCP $client $server + done + done + done +} + # # Run ipoibarping to note peers about new Infiniband address # @@ -663,9 +694,15 @@ ip_start() { run_send_ib_arp ;; *) - if [ -x $SENDARP ]; then - run_send_arp - fi + if [ -x $SENDARP ]; then + run_send_arp + fi + + if [ -n "$OCF_RESKEY_tickle_dir" ]; then + mkdir -p $OCF_RESKEY_tickle_dir/`hostname` + echo 1 > /proc/sys/net/ipv4/tcp_tw_recycle + run_tickle_tcp + fi ;; esac exit $OCF_SUCCESS @@ -741,6 +778,7 @@ ip_monitor() { local ip_status=`ip_served` case $ip_status in ok) + save_tcp_connections return $OCF_SUCCESS ;; partial|no) _______________________________________________________ Linux-HA-Dev: Linux-HA-Dev@lists.linux-ha.org http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev Home Page: http://linux-ha.org/