The two separate test applications are:

omxtestserver.c:
#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>
#include <unistd.h>
#include "myriexpress.h"

/*
 * omxtestserver: passive side of the connect test.
 *
 * Initializes MX, opens endpoint 0 on the first NIC and then idles forever so
 * that the endpoint stays open while the client tries to connect to it.
 *
 * Exits with status 1 if MX cannot be initialized or the endpoint cannot be
 * opened; otherwise it never returns and has to be killed.
 *
 * NOTE(review): after opening the endpoint this program never calls back into
 * the MX library. If the library only handles incoming connect requests while
 * the application is inside an MX call, a peer's mx_iconnect() would stay
 * pending for as long as this loop runs -- confirm against the Open-MX
 * documentation.
 */
int main(int argc, char *argv[]) {
  mx_return_t ret;
  mx_endpoint_t ep;

  /* No command-line options are used. */
  (void)argc;
  (void)argv;

  ret = mx_init();
  if (ret) {
    printf("mx_init() returned %s\n", mx_strerror(ret));
    exit(1);
  }

  /* use the first NIC, open endpoint 0, filter = 0, no params */
  ret = mx_open_endpoint(0, 0, 0, NULL, 0, &ep);
  if (ret) {
    printf("open_endpoint() returned %s\n", mx_strerror(ret));
    exit(1);
  }

  /* Keep the process (and therefore the endpoint) alive indefinitely. */
  for (;;) {
    sleep(60);
  }

  /* Not reached: the loop above only ends when the process is killed. */
  mx_finalize();
  return 0;
}

omxtestclient.c:
#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>
#include <unistd.h>
#include "myriexpress.h"

/*
 * omxtestclient: active side of the connect test.
 *
 * Initializes MX, opens endpoint 1 on the first NIC, resolves the NIC id for
 * the hostname "begbie:0", posts a non-blocking connect to endpoint 0 on that
 * NIC and then polls the request once per second with mx_test() until it
 * completes, printing the completion status.
 *
 * Returns 0 once the connect request has completed (whatever its status).
 * Exits with status 1 if any MX call reports an error.
 */
int main(int argc, char *argv[]) {
  uint32_t result = 0;
  uint64_t nic_id_from_hostname;
  mx_return_t ret;
  mx_endpoint_t ep;
  mx_request_t request;
  mx_status_t status;

  /* No command-line options are used. */
  (void)argc;
  (void)argv;

  ret = mx_init();
  if (ret) {
    printf("mx_init() returned %s\n", mx_strerror(ret));
    exit(1);
  }

  /* use the first NIC, open endpoint 1, filter = 0, no params */
  ret = mx_open_endpoint(0, 1, 0, NULL, 0, &ep);
  if (ret) {
    printf("open_endpoint() returned %s\n", mx_strerror(ret));
    exit(1);
  }

  ret = mx_hostname_to_nic_id("begbie:0", &nic_id_from_hostname);
  if (ret) {
    printf("mx_hostname_to_nic_id() returned %s\n", mx_strerror(ret));
    exit(1);
  }

  /* Connect to endpoint 0 on the resolved NIC; key 0 matches the filter the
   * server passes to mx_open_endpoint(); match info 0, no context. */
  ret = mx_iconnect(ep, nic_id_from_hostname, 0, 0, 0, NULL, &request);
  if (ret) {
    printf("iconnect() returned %s\n", mx_strerror(ret));
    exit(1);
  }

  /* Poll until the connect request completes. */
  do {
    ret = mx_test(ep, &request, &status, &result);
    if (ret) {
      /* A failing mx_test() never sets result, so bail out instead of
       * polling forever. */
      printf("mx_test() returned %s\n", mx_strerror(ret));
      exit(1);
    }
    if (result) {
      printf("iconnect to nic_id from hostname completed with status %s\n", mx_strstatus(status.code));
    } else {
      printf("mx_test() returned %d\n", ret);
      sleep(1);
    }
  } while (!result);

  /* Release the endpoint before shutting the library down. */
  ret = mx_close_endpoint(ep);
  if (ret) {
    printf("close_endpoint() returned %s\n", mx_strerror(ret));
  }

  mx_finalize();
  return 0;
}


I have run it with full Open-MX debugging turned on:

server:

jrandall@begbie:/tmp$ OMX_VERBOSE=1 OMX_VERBDEBUG="PCSLMQRUEATWV" ./omxtestserver
OMX: trying to open board [0,0] endpoint [0,0]
OMX: trying to open board #0 endpoint #0
OMX: successfully open board #0 endpoint #0
OMX: desc at 0x7f86bc984000 sendq at 0x7f86bb9a9000, recvq at 0x7f86bb1a9000, exp eventq at 0x7f86bc974000, unexp at 0x7f86bc95a000
OMX: Successfully attached endpoint #0 on board #0 (hostname 'begbie: 0', name 'eth3', addr 00:1b:21:4f:4d:5a)
OMX: created partner 0000001b214f4d5a ep 0 peer index 0
OMX: created myself partner 0000001b214f4d5a ep 0 peer index 0

client:

jrandall@begbie:/tmp$ OMX_VERBOSE=1 OMX_VERBDEBUG="PCSLMQRUEATWV" ./omxtestclient
OMX: Forcing debugging signal to enabled (level 1)
OMX: Forcing connect polling all endpoints to enabled
OMX: created myself partner 0000001b214f4d5a ep 1 peer index 0
OMX: created partner 0000001b214f4d5a ep 0 peer index 0
mx_test() returned 0
mx_test() returned 0
mx_test() returned 0
mx_test() returned 0
mx_test() returned 0
mx_test() returned 0
mx_test() returned 0
mx_test() returned 0
mx_test() returned 0
mx_test() returned 0
mx_test() returned 0
mx_test() returned 0
... ad infinitum


I would note also that if I run the client without the server running, I get:

jrandall@begbie:/tmp$ OMX_VERBOSE=1 OMX_VERBDEBUG="PCSLMQRUEATWV" ./omxtestclient
OMX: Forcing debugging signal to enabled (level 1)
OMX: Forcing connect polling all endpoints to enabled
OMX: trying to open board [0,0] endpoint [1,1]
OMX: trying to open board #0 endpoint #1
OMX: successfully open board #0 endpoint #1
OMX: desc at 0x7f767cb88000 sendq at 0x7f767bbad000, recvq at 0x7f767b3ad000, exp eventq at 0x7f767cb78000, unexp at 0x7f767cb5e000
OMX: Successfully attached endpoint #1 on board #0 (hostname 'begbie: 0', name 'eth3', addr 00:1b:21:4f:4d:5a)
OMX: created partner 0000001b214f4d5a ep 1 peer index 0
OMX: created myself partner 0000001b214f4d5a ep 1 peer index 0
OMX: created partner 0000001b214f4d5a ep 0 peer index 0
OMX: received type 25
OMX: got nack from partner 0000001b214f4d5a ep 0 for seqnum 0
OMX: Completing iconnect request: Remote Endpoint is Closed

which seems correct.


So, it seems like the Open-MX library can tell when the self endpoint is closed, but if it is not closed, the connection request does not appear to arrive at the destination library at all.

Open-MX has a facility (using the debugging library) to dump the status of all requests when sent the USR1 signal. While running both client and server, I signalled both processes to do so:

server:
jrandall@begbie:/tmp$ OMX_VERBOSE=1 OMX_VERBDEBUG="PCSLMQRUEATWV" OMX_DEBUG_SIGNAL=1 ./omxtestserver
OMX: Forcing debugging signal to enabled (level 1)
OMX: trying to open board [0,0] endpoint [0,0]
OMX: trying to open board #0 endpoint #0
OMX: successfully open board #0 endpoint #0
OMX: desc at 0x7ff384ac7000 sendq at 0x7ff383aec000, recvq at 0x7ff3832ec000, exp eventq at 0x7ff384ab7000, unexp at 0x7ff384a9d000
OMX: Successfully attached endpoint #0 on board #0 (hostname 'begbie: 0', name 'eth3', addr 00:1b:21:4f:4d:5a)
OMX: created partner 0000001b214f4d5a ep 0 peer index 0
OMX: created myself partner 0000001b214f4d5a ep 0 peer index 0
.Endpoint 0 on Board 0:
   Total 0 partners excluding myself
  Recv                  : 0 requests
  Unexpected            : 0 requests
  Done                  : 0 requests
  Missing resources     : 0 requests
  Driver mediumsq sending : 0 requests
  Non-acked             : 0 requests
  Partial medium recv   : 0 requests
  Internal done         : 0 requests
  Large send            : 0 requests
  Driver pulling        : 0 requests
  Connect               : 0 requests
  Non-acked             : 0 requests
  Unexpected self send  : 0 requests

client:

jrandall@begbie:/tmp$ OMX_VERBOSE=1 OMX_VERBDEBUG="PCSLMQRUEATWV" OMX_DEBUG_SIGNAL=1 ./omxtestclient
OMX: Forcing debugging signal to enabled (level 1)
OMX: Forcing connect polling all endpoints to enabled
OMX: trying to open board [0,0] endpoint [1,1]
OMX: trying to open board #0 endpoint #1
OMX: successfully open board #0 endpoint #1
OMX: desc at 0x7f5519443000 sendq at 0x7f5518468000, recvq at 0x7f5517c68000, exp eventq at 0x7f5519433000, unexp at 0x7f5519419000
OMX: Successfully attached endpoint #1 on board #0 (hostname 'begbie: 0', name 'eth3', addr 00:1b:21:4f:4d:5a)
OMX: created partner 0000001b214f4d5a ep 1 peer index 0
OMX: created myself partner 0000001b214f4d5a ep 1 peer index 0
OMX: created partner 0000001b214f4d5a ep 0 peer index 0
mx_test() returned 0
mx_test() returned 0
...

mx_test() returned 0
mx_test() returned 0
OMX: No progression occured in the last 1 seconds (101 jiffies)
mx_test() returned 0
mx_test() returned 0
mx_test() returned 0
mx_test() returned 0
Endpoint 1 on Board 0:
  Partner addr 0000001b214f4d5a endpoint 0 index 0:
    Send session c000 next 16383 ack next 16383
    Recv session 0 next match 1 next frag 1 last acked 1
    Missing seqnum      : 0 requests
    Non-acked           : 0 requests
    Partial medium recv : 0 requests
    Connect             : 1 requests
    Early packets: 0 early packets
   Total 1 partners excluding myself
  Recv                  : 0 requests
  Unexpected            : 0 requests
  Done                  : 0 requests
  Missing resources     : 0 requests
  Driver mediumsq sending : 0 requests
  Non-acked             : 0 requests
  Partial medium recv   : 0 requests
  Internal done         : 0 requests
  Large send            : 0 requests
  Driver pulling        : 0 requests
  Connect               : 1 requests
  Non-acked             : 0 requests
  Unexpected self send  : 0 requests

mx_test() returned 0
mx_test() returned 0

So, it appears the client does still have a pending connect request, but the server library doesn't seem to have heard of it.

I have opened an Open-MX bug to track this issue (https://gforge.inria.fr/tracker/index.php?func=detail&aid=12719&group_id=889&atid=3614 )

Josh.

_______________________________________________
Pvfs2-users mailing list
[email protected]
http://www.beowulf-underground.org/mailman/listinfo/pvfs2-users

Reply via email to