On Thu, Jul 02, 2009 at 03:23:04PM +0200, Lars Ellenberg wrote:
> On Thu, Jul 02, 2009 at 11:30:25AM +0300, Amir Vadai wrote:
> > Please attach the perl script to reproduce and I will check it.
> > As to the second problem - I did notice such behavior but couldn't
> > find a scenario to reproduce it. I guess it happen when the socket is
> > closed due to error.
> >
> > Tell me if you notice any message in dmesg.
>
> My former test cluster has been reassigned.
> The new test hardware will be available earliest tomorrow.
> But I don't think there was anything relevant in dmesg.
>
> I'll try to reproduce on the new test hardware then,
> and will get back to you as soon as possible.
ok.
finally new test hardware working.
this is on debian lenny,
userland ofed from http://pkg-ofed.alioth.debian.org/apt/ofed/
kernel git://git.openfabrics.org/ofed_1_4/linux-2.6.git
merged with upstream stable
git://git4.kernel.org/pub/scm/linux/kernel/git/stable/linux-2.6.27.y.git
both as of today:
ofed_kernel 08acda8 sdp: Fix memory leak in bzcopy
linux-v2.6.27.y/master 49cbf40 Linux 2.6.27.26
kernel config: very many "kernel debugging" things enabled.
if you want me to try a certain .config, or anything,
this can be arranged.
two very ugly perl scripts attached,
one tcp server,
one tcp client,
adapted from the perlipc man page.
client connects,
sends a package,
receives a package
in an endless loop.
package format:
4 byte magic, 2 byte ignored, 2 byte payload length
indicated length of payload, all same bytes,
but the trailing 4 byte, which again is a magic number.
with ethernet, or IPoIB: runs endless.
with LD_PRELOAD=libsdp.so runs for very few iterations,
and errors out on one of the sanity checks.
sample output:
r...@kugel:/home/lars/DRBD/IB_SDP# LD_PRELOAD=libsdp.so perl my_client.pl
rum-ib0
2009-07-07 19:07:08 my_client.pl 4300: recv hdr [25]: invalid magic: 32 30 30
39 2d 30 37 2d
which hapeens to be the hexdump of the string "2009-07-".
where did it copy_user() that from? wtf?
r...@kugel:/home/lars/DRBD/IB_SDP# LD_PRELOAD=libsdp.so perl my_client.pl
rum-ib0
2009-07-07 19:07:08 my_client.pl 4301: recv payload [35]: expected 12296, but
received 12295 byte; last bytes received: 55 e4 e3 e2
^^^^ pid, ^^ seq number.
so for only 35 ping/pongs it did work ok.
exactly: one byte too short.
the trailing magic expected is e4 e3 e1 e1
r...@kugel:/home/lars/DRBD/IB_SDP# LD_PRELOAD=libsdp.so perl my_client.pl
rum-ib0
2009-07-07 19:07:09 my_client.pl 4302: recv payload [33]: expected 4131, but
received 4130 byte; last bytes received: 55 e4 e3 e2
r...@kugel:/home/lars/DRBD/IB_SDP# LD_PRELOAD=libsdp.so perl my_client.pl
rum-ib0
2009-07-07 19:07:10 my_client.pl 4303: recv payload [29]: expected 16401, but
received 16400 byte; last bytes received: 55 e4 e3 e2
r...@kugel:/home/lars/DRBD/IB_SDP# LD_PRELOAD=libsdp.so perl my_client.pl
rum-ib0
2009-07-07 19:07:12 my_client.pl 4304: recv payload [21]: expected 4110, but
received 4109 byte; last bytes received: 55 e4 e3 e2
r...@kugel:/home/lars/DRBD/IB_SDP# LD_PRELOAD=libsdp.so perl my_client.pl
rum-ib0
2009-07-07 19:07:13 my_client.pl 4305: recv payload [4]: expected 20495, but
received 20494 byte; last bytes received: 55 e4 e3 e2
r...@kugel:/home/lars/DRBD/IB_SDP# LD_PRELOAD=libsdp.so perl my_client.pl
rum-ib0
2009-07-07 19:07:14 my_client.pl 4306: recv payload [12]: expected 22530, but
received 22529 byte; last bytes received: 55 e4 e3 e2
any suggestions how to proceed from here?
--
: Lars Ellenberg
: LINBIT | Your Way to High Availability
: DRBD/HA support and consulting http://www.linbit.com
DRBD® and LINBIT® are registered trademarks of LINBIT, Austria.
#!/usr/bin/perl -w
# adapted from perlipc(1)
use strict;
use Socket;
use POSIX 'strftime';
my ($remote,$port, $iaddr, $paddr, $proto, $line);
my $seq_nr = 0;
$remote = shift || 'localhost';
$port = shift || 2345; # random port
if ($port =~ /\D/) { $port = getservbyname($port, 'tcp') }
die "No port" unless $port;
$iaddr = inet_aton($remote) || die "no host: $remote";
$paddr = sockaddr_in($port, $iaddr);
$proto = getprotobyname('tcp');
socket(SOCK, PF_INET, SOCK_STREAM, $proto) || die "socket: $!";
connect(SOCK, $paddr) || die "connect: $!";
exit 1 if ping_pong_with_guard_bytes();
close (SOCK) || die "close: $!";
exit;
sub logmsg { print strftime("%F %T", localtime), " $0 $$: @_\n" }
sub random_packet()
{
# TODO: add checksum
my $p_len = int(rand(33000));
pack("Nnn", 0xb1b2b3b4, 0x3333, $p_len + 4) .
("\x55" x $p_len) . pack("N", 0xe4e3e2e1);
}
sub receive_and_validate_packet {
my ($hdr, $payload, $magic, $cmd, $len);
$hdr = "\xff\xee\xdd\xcc" x 1024; # stupid; just to see where the corruption is from.
if (!defined(recv SOCK, $hdr, 8, MSG_WAITALL)) {
logmsg "recv hdr error [$seq_nr]: $!";
return 1;
}
if (length($hdr) != 8) {
logmsg "recv hdr [$seq_nr]: too short: ",
join " ", map { sprintf "%02x", $_ } unpack "C*", $hdr;
return 1;
}
($magic, $cmd, $len) = unpack "Nnn", $hdr;
if ($magic != 0xb1b2b3b4) {
logmsg "recv hdr [$seq_nr]: invalid magic: ",
join " ", map { sprintf "%02x", $_ } unpack "C*", $hdr;
return 1;
}
if (!defined(recv SOCK, $payload, $len, MSG_WAITALL)) {
logmsg "recv payload error [$seq_nr]: $!";
return 1;
}
my $plen = length($payload);
my $p_last_4 = substr($payload, -4, 4);
my $p_trail_magic = unpack "N", $p_last_4;
if ($plen != $len) {
logmsg "recv payload [$seq_nr]: expected $len, but received $plen byte; last bytes received: ",
join " ", map { sprintf "%02x", $_ } unpack "C*", $p_last_4;
return 1;
}
if ($p_trail_magic != 0xe4e3e2e1) {
logmsg "recv payload [$seq_nr]: invalid trailing magic; last bytes received: ",
join " ", map { sprintf "%02x", $_ } unpack "C*", $p_last_4;
return 1;
}
++$seq_nr;
return 0;
}
sub send_random_packet() {
my $p = random_packet();
my $p_len = length($p);
# this is supposedly a blocking send,
# so it should send the whole packet!
my $s_len = send SOCK, $p, 0;
if (!defined($s_len)) {
logmsg "send error: $!";
return 1;
}
if ($s_len != $p_len) {
logmsg "send error: wanted to send $p_len, but could only send $s_len bytes";
return 1;
}
}
sub ping_pong_with_guard_bytes {
my $ret;
while (1) {
$ret = send_random_packet();
return $ret if $ret;
$ret = receive_and_validate_packet();
return $ret if $ret;
}
}
#!/usr/bin/perl -Tw
# adapted from perlipc(1)
use strict;
use POSIX 'strftime';
use Socket;
use Carp;
sub spawn; # forward declaration
sub logmsg { print strftime("%F %T", localtime), " $0 $$: @_\n" }
my $port = shift || 2345;
my $proto = getprotobyname('tcp');
($port) = $port =~ /^(\d+)$/ or die "invalid port";
socket(Server, PF_INET, SOCK_STREAM, $proto) || die "socket: $!";
setsockopt(Server, SOL_SOCKET, SO_REUSEADDR,
pack("l", 1)) || die "setsockopt: $!";
bind(Server, sockaddr_in($port, INADDR_ANY)) || die "bind: $!";
listen(Server,SOMAXCONN) || die "listen: $!";
logmsg "server started on port $port";
my $waitedpid = 0;
my $paddr;
use POSIX ":sys_wait_h";
use Errno;
sub REAPER {
local $!; # don't let waitpid() overwrite current error
while ((my $pid = waitpid(-1,WNOHANG)) > 0 && WIFEXITED($?)) {
logmsg "reaped $waitedpid" . ($? ? " with exit $?" : '');
}
$SIG{CHLD} = \&REAPER; # loathe sysV
}
$SIG{CHLD} = \&REAPER;
my $seq_nr;
while(1) {
$paddr = accept(Client, Server) || do {
# try again if accept() returned because a signal was received
next if $!{EINTR};
die "accept: $!";
};
my ($port, $iaddr) = sockaddr_in($paddr);
my $name = gethostbyaddr($iaddr, AF_INET);
logmsg "connection from $name [", inet_ntoa($iaddr), "] at port $port";
spawn \&pong_ping_with_guard_bytes;
close Client;
}
sub spawn {
my $coderef = shift;
unless (@_ == 0 && $coderef && ref($coderef) eq 'CODE') {
confess "usage: spawn CODEREF";
}
my $pid;
if (! defined($pid = fork)) {
logmsg "cannot fork: $!";
return;
}
elsif ($pid) {
logmsg "begat $pid";
return; # I'm the parent
}
# else I'm the child -- go spawn
$seq_nr = 0;
open(STDIN, "<&Client") || die "can't dup client to stdin";
open(STDOUT, ">&Client") || die "can't dup client to stdout";
## open(STDERR, ">&STDOUT") || die "can't dup stdout to stderr";
exit &$coderef();
}
sub random_packet()
{
# TODO: add checksum
my $p_len = int(rand(33000));
pack("Nnn", 0xb1b2b3b4, 0x3333, $p_len + 4) .
("\x55" x $p_len) . pack("N", 0xe4e3e2e1);
}
sub receive_and_validate_packet {
my ($hdr, $payload, $magic, $cmd, $len);
$hdr = "\xbb\xaa\x99\x88" x 1024; # stupid; just to see where the corruption is from.
if (!defined(recv STDIN, $hdr, 8, MSG_WAITALL)) {
logmsg "recv hdr error [$seq_nr]: $!";
return 1;
}
if (length($hdr) != 8) {
logmsg "recv hdr [$seq_nr]: too short: ",
join " ", map { sprintf "%02x", $_ } unpack "C*", $hdr;
return 1;
}
($magic, $cmd, $len) = unpack "Nnn", $hdr;
if ($magic != 0xb1b2b3b4) {
logmsg "recv hdr [$seq_nr]: invalid magic: ",
join " ", map { sprintf "%02x", $_ } unpack "C*", $hdr;
return 1;
}
if (!defined(recv STDIN, $payload, $len, MSG_WAITALL)) {
logmsg "recv payload error [$seq_nr]: $!";
return 1;
}
my $plen = length($payload);
my $p_last_4 = substr($payload, -4, 4);
my $p_trail_magic = unpack "N", $p_last_4;
if ($plen != $len) {
logmsg "recv payload [$seq_nr]: expected $len, but received $plen byte; last bytes received: ",
join " ", map { sprintf "%02x", $_ } unpack "C*", $p_last_4;
return 1;
}
if ($p_trail_magic != 0xe4e3e2e1) {
logmsg "recv payload [$seq_nr]: invalid trailing magic; last bytes received: ",
join " ", map { sprintf "%02x", $_ } unpack "C*", $p_last_4;
return 1;
}
++$seq_nr;
return 0;
}
sub send_random_packet() {
my $p = random_packet();
my $p_len = length($p);
# this is supposedly a blocking send,
# so it should send the whole packet!
my $s_len = send STDOUT, $p, 0;
if (!defined($s_len)) {
logmsg "send error: $!";
return 1;
}
if ($s_len != $p_len) {
logmsg "send error: wanted to send $p_len, but could only send $s_len bytes";
return 1;
}
}
sub pong_ping_with_guard_bytes {
my $ret;
while (1) {
$ret = receive_and_validate_packet();
return $ret if $ret;
$ret = send_random_packet();
return $ret if $ret;
}
logmsg "received $seq_nr packets\n";
return 0;
}
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general