Hi,
Here is my patch; I have been working with kernel 2.6.18 on CentOS 5.2.
I still have some trouble accessing struct sg_list, but this
implementation allows forwarding information from struct ib_send_wr in
the function post_send (in mlx4/qp.c).
The userland program is also attached, but it only prints the
information to the console.
I will keep looking at this,
Regards,
Thierry
On Wed, Jun 24, 2009 at 7:55 PM, Jason
Gunthorpe<[email protected]> wrote:
> On Wed, Jun 24, 2009 at 06:55:43PM +0300, Moni Shoua wrote:
>
>> I believe that Jason and I still disagree but...
>
>> Jason suggests that I implement this feature with netlink. This
>> approach might have an advantage but if I understand it right this
>> approach requires a patch also to some user application in order to
>> take advantage of this patch.
>
> The kernel devs have made it clear that the preferred way to export
> this information is through netlink and/or a file in /proc/net/..
>
> You never got an answer if /proc/net is truely discouraged or not.
>
> debugfs is completely useless because it cannot be used for actual
> end-user interrogation and the rules are it should not be used as a
> stable kernel-user interface. (ie is NOT a new dumping ground like
> /proc/ was)
>
>> Also, I think that there is a value for a virtual text file under
>> debugfs to monitor rdma_cm connections easily and without any other
>> special app besides 'cat' (just like in IPoIB).
>
> If you have the netlink based program there really is no point in
> including something under debugfs. It is just bloat.
>
>> Finally, this implementation doesn't contradict netlink
>> implementation in the future (and it won't be the first time).
>
> As I said before, the stack has reached a level of maturity that new
> stuff going in should meaningfully 'move the ball forward' toward a
> mature and complete stack. In the context of state reporting that
> means a stable user interface that programs like lsof, netstat, etc,
> can rely upon.
>
> Putting something under debugfs clearly does nothing to advance that
> goal.
>
> Further, there is clearly alot of state information we should be
> exporting to userspace. RDMA-CM stuff is only a tiny portion. netlink
> is the kernel devs answer to all of this.
>
> Jason
>
diff -Nur linux-2.6.18.i686.ori/drivers/infiniband/hw/mlx4/qp.c linux-2.6.18.i686/drivers/infiniband/hw/mlx4/qp.c
--- linux-2.6.18.i686.ori/drivers/infiniband/hw/mlx4/qp.c 2009-06-25 14:31:17.000000000 +0200
+++ linux-2.6.18.i686/drivers/infiniband/hw/mlx4/qp.c 2009-06-08 10:14:32.000000000 +0200
@@ -39,6 +39,11 @@
#include "mlx4_ib.h"
#include "user.h"
+#include <linux/netlink_trap.h>
+
+extern void netlink_ib_send(struct ibb_send_wr *wr);
+extern struct ibb_send_wr * ib_dump_trap(struct ib_send_wr * wr);
+
enum {
MLX4_IB_ACK_REQ_FREQ = 8,
};
@@ -1529,7 +1534,50 @@
int uninitialized_var(stamp);
int uninitialized_var(size);
int i;
+ struct ibb_send_wr * result = kmalloc(sizeof(struct ibb_send_wr), GFP_KERNEL);
+// result = ib_dump_trap(wr);
+// netlink_ib_send(result);
+/* if (wr->sg_list != NULL)
+ {
+ printk(KERN_INFO "wr_id = %llx\n", wr->wr_id);
+ printk(KERN_INFO "lkey = %02x\nlength = %02x\naddr= %llx\n",wr->sg_list->lkey,wr->sg_list->length,wr->sg_list->addr);//(*sg_list).lkey);
+ printk(KERN_INFO "lkey = %02x\nlength = %02x\naddr= %llx\n",wr->sg_list->lkey,ntohl(wr->sg_list->length),ntohl(wr->sg_list->addr));//(*sg_list).lkey);
+ printk(KERN_INFO "num_sge = %i\n", wr->num_sge);
+ printk(KERN_INFO "flag = %i\n",wr->send_flags);
+
+ printk(KERN_INFO "remote_addr = %llx\n", wr->wr.rdma.remote_addr);
+ printk(KERN_INFO "rkey = %llx\n", wr->wr.rdma.rkey);
+
+ printk(KERN_INFO "remote_addr = %llx\n", wr->wr.atomic.remote_addr);
+ printk(KERN_INFO "compare_add = %llx\n", wr->wr.atomic.compare_add);
+ printk(KERN_INFO "swap = %llx\n", wr->wr.atomic.swap);
+ printk(KERN_INFO "rkey = %llx\n", wr->wr.atomic.remote_addr);
+
+ printk(KERN_INFO "hlen = %i\n", wr->wr.ud.hlen);
+ printk(KERN_INFO "mss = %i\n", wr->wr.ud.mss);
+ printk(KERN_INFO "remote_qpn = %02x\n", wr->wr.ud.remote_qpn);
+ printk(KERN_INFO "remote_qkey = %02x\n", wr->wr.ud.remote_qkey);
+ printk(KERN_INFO "pkey_index = %01x\n", wr->wr.ud.pkey_index);
+
+ //char * bob;
+ //bob = kmalloc(104*sizeof(char), GFP_ATOMIC);
+ unsigned long addr_phys = virt_to_phys(wr->sg_list->addr);
+ printk(KERN_INFO "addr phys = %lx\n\n",addr_phys);
+ */
+/*
+ int u = wr->sg_list->length;
+ char vor[wr->sg_list->length];
+ int i;
+ for(i=0;i<u;i++) {
+ bob[i]='a';
+ }
+ memcpy(vor, bob, u);//sizeof(char));
+ char truc;
+ printk(KERN_INFO "msg : %s\n", vor);*/
+ //memcpy(bob, wr->sg_list->addr, wr->sg_list->length);
+ //printk(KERN_INFO "msg : %s\n", bob);
+// }
spin_lock_irqsave(&qp->sq.lock, flags);
ind = qp->sq_next_wqe;
diff -Nur linux-2.6.18.i686.ori/drivers/infiniband/util/Kconfig linux-2.6.18.i686/drivers/infiniband/util/Kconfig
--- linux-2.6.18.i686.ori/drivers/infiniband/util/Kconfig 2009-06-25 14:31:10.000000000 +0200
+++ linux-2.6.18.i686/drivers/infiniband/util/Kconfig 2009-04-29 11:39:58.000000000 +0200
@@ -4,3 +4,5 @@
---help---
Prints sent and received MADs on QP 0/1 for debugging.
+config INFINIBAND_NETLINK_TRAP
+ tristate "Infiniband trap using netlink"
Les fichiers binaires linux-2.6.18.i686.ori/drivers/infiniband/util/kernel_part.tgz et linux-2.6.18.i686/drivers/infiniband/util/kernel_part.tgz sont différents.
diff -Nur linux-2.6.18.i686.ori/drivers/infiniband/util/Makefile linux-2.6.18.i686/drivers/infiniband/util/Makefile
--- linux-2.6.18.i686.ori/drivers/infiniband/util/Makefile 2009-06-25 14:31:12.000000000 +0200
+++ linux-2.6.18.i686/drivers/infiniband/util/Makefile 2009-04-29 11:40:29.000000000 +0200
@@ -1,3 +1,5 @@
obj-$(CONFIG_INFINIBAND_MADEYE) += ib_madeye.o
+obj-$(CONFIG_INFINIBAND_NETLINK_TRAP) += ib_netlink_trap.o
ib_madeye-y := madeye.o
+ib_netlink_trap-y := netlink_trap.o
diff -Nur linux-2.6.18.i686.ori/drivers/infiniband/util/netlink_trap.c linux-2.6.18.i686/drivers/infiniband/util/netlink_trap.c
--- linux-2.6.18.i686.ori/drivers/infiniband/util/netlink_trap.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.18.i686/drivers/infiniband/util/netlink_trap.c 2009-06-05 14:27:00.000000000 +0200
@@ -0,0 +1,166 @@
+#include <linux/autoconf.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/netlink.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <rdma/ib_verbs.h>
+#include <linux/netlink_trap.h>
+
+/*
+ * NETLINK_IB is an arbitrarily chosen netlink protocol number that is not
+ * used elsewhere; it should eventually be defined in linux/netlink.h.
+ * GROUP_IB is the (also arbitrary) multicast group the traps are sent to.
+ * MAX_PAYLOAD is an arbitrary upper bound on the payload of one message.
+ */
+#define MAX_PAYLOAD 4096
+#define NETLINK_IB 24
+#define GROUP_IB 23
+
+static struct sock *nl_sk; /* kernel-side netlink socket, NULL until init */
+
+/*
+ * Input callback registered with netlink_kernel_create(): called when data
+ * is received on the kernel socket. Not used for anything yet; it only logs
+ * and wakes up sleepers on the socket.
+ */
+static void nl_ib_data_ready(struct sock *sk, int len)
+{
+	printk(KERN_INFO "msg recu dans le noyau\n");
+	wake_up_interruptible(sk->sk_sleep);
+}
+
+/*
+ * Broadcast one ibb_send_wr snapshot to netlink multicast group GROUP_IB.
+ *
+ * @wr: snapshot to send. It is copied into the message and not freed here,
+ *      so the caller keeps ownership (results of ib_dump_trap() need kfree()).
+ *
+ * The message holds a filled-in nlmsghdr followed by exactly sizeof(*wr)
+ * bytes, rather than MAX_PAYLOAD bytes of mostly uninitialised skb memory.
+ * GFP_ATOMIC is used because the intended caller is the mlx4 post_send path.
+ * NOTE(review): confirm whether that path may sleep; GFP_ATOMIC is safe
+ * either way.
+ */
+void netlink_ib_send(struct ibb_send_wr *wr)
+{
+	struct sk_buff *skb;
+	struct nlmsghdr *nlh;
+	int err;
+
+	/* Nothing to send, or socket creation failed at module init. */
+	if (wr == NULL || nl_sk == NULL)
+		return;
+
+	BUILD_BUG_ON(sizeof(*wr) > MAX_PAYLOAD);
+
+	skb = alloc_skb(NLMSG_SPACE(sizeof(*wr)), GFP_ATOMIC);
+	if (skb == NULL) {
+		printk(KERN_WARNING "netlink_ib: cannot allocate skb\n");
+		return;
+	}
+
+	/* Reserve header + payload and zero them so no padding leaks out. */
+	nlh = (struct nlmsghdr *)skb_put(skb, NLMSG_SPACE(sizeof(*wr)));
+	memset(nlh, 0, NLMSG_SPACE(sizeof(*wr)));
+	nlh->nlmsg_len = NLMSG_LENGTH(sizeof(*wr));
+	nlh->nlmsg_pid = 0; /* sent from the kernel */
+	nlh->nlmsg_flags = 0;
+	memcpy(NLMSG_DATA(nlh), wr, sizeof(*wr));
+
+	NETLINK_CB(skb).pid = 0;
+	NETLINK_CB(skb).dst_pid = 0;
+	NETLINK_CB(skb).dst_group = GROUP_IB; /* multicast; 0 would mean unicast */
+
+	/* netlink_broadcast() consumes the skb; -ESRCH just means no listener. */
+	err = netlink_broadcast(nl_sk, skb, 0, GROUP_IB, GFP_ATOMIC);
+	if (err < 0 && err != -ESRCH)
+		printk(KERN_INFO "Error during netlink_broadcast: %i\n", err);
+}
+
+/*
+ * Snapshot the scalar fields of @wr into a newly allocated ibb_send_wr.
+ *
+ * Returns NULL if @wr is NULL or the allocation fails; otherwise the caller
+ * owns the result and must kfree() it. The buffer is zeroed because addr,
+ * length and lkey are not filled in yet. Only the union member selected by
+ * wr->opcode is copied: the members overlap, so copying all of them made the
+ * later stores clobber the earlier ones.
+ */
+struct ibb_send_wr *ib_dump_trap(struct ib_send_wr *wr)
+{
+	struct ibb_send_wr *result;
+
+	if (wr == NULL)
+		return NULL;
+
+	result = kzalloc(sizeof(*result), GFP_ATOMIC);
+	if (result == NULL)
+		return NULL;
+
+	result->wr_id = wr->wr_id;
+	result->num_sge = wr->num_sge;
+	result->opcode = wr->opcode;
+	result->send_flags = wr->send_flags;
+	result->imm_data = wr->imm_data;
+
+	switch (wr->opcode) {
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+	case IB_WR_RDMA_READ:
+		result->wr.rdma.remote_addr = wr->wr.rdma.remote_addr;
+		result->wr.rdma.rkey = wr->wr.rdma.rkey;
+		break;
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+		result->wr.atomic.remote_addr = wr->wr.atomic.remote_addr;
+		result->wr.atomic.compare_add = wr->wr.atomic.compare_add;
+		result->wr.atomic.swap = wr->wr.atomic.swap;
+		result->wr.atomic.rkey = wr->wr.atomic.rkey;
+		break;
+	default:
+		/* Sends: presumably only meaningful on UD QPs -- TODO confirm. */
+		result->wr.ud.remote_qpn = wr->wr.ud.remote_qpn;
+		result->wr.ud.remote_qkey = wr->wr.ud.remote_qkey;
+		break;
+	}
+
+	return result;
+}
+
+/*
+ * Module init: create the kernel-side netlink socket.
+ * Returns 0 on success or -ENOMEM if the socket cannot be created, so the
+ * module is never left loaded with a NULL nl_sk.
+ */
+static int __init netlink_ib_module_init(void)
+{
+	printk(KERN_INFO "Init netlink_ib module\n");
+	/* Arguments: net namespace, protocol, groups, input callback, (unused), owning module. */
+	nl_sk = netlink_kernel_create(&init_net, NETLINK_IB, GROUP_IB, nl_ib_data_ready, NULL, THIS_MODULE);
+	if (nl_sk == NULL) {
+		printk(KERN_ERR "Error during netlink_kernel_create\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+/*
+ * Module exit: release the netlink socket created at init.
+ */
+static void __exit netlink_ib_module_exit(void)
+{
+	printk(KERN_INFO "Unloading netlink_ib module\n");
+	sock_release(nl_sk->sk_socket);
+}
+
+MODULE_AUTHOR("[email protected]");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("0.6");
+MODULE_DESCRIPTION("Kernel trap using netlink socket for IB support");
+EXPORT_SYMBOL(netlink_ib_send);
+EXPORT_SYMBOL(ib_dump_trap);
+module_init(netlink_ib_module_init);
+module_exit(netlink_ib_module_exit);
diff -Nur linux-2.6.18.i686.ori/include/linux/netlink_trap.h linux-2.6.18.i686/include/linux/netlink_trap.h
--- linux-2.6.18.i686.ori/include/linux/netlink_trap.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.18.i686/include/linux/netlink_trap.h 2009-06-05 14:27:27.000000000 +0200
@@ -0,0 +1,47 @@
+#ifndef _LINUX_NETLINK_TRAP_H
+#define _LINUX_NETLINK_TRAP_H
+
+#include <linux/types.h>
+#include <rdma/ib_verbs.h>
+
+/*
+ * Flat snapshot of an ib_send_wr, sent as the payload of each NETLINK_IB
+ * message. The userland listener copies the payload straight into its own
+ * copy of this struct, so both definitions must keep the same layout.
+ */
+struct ibb_send_wr {
+	uint64_t wr_id;
+
+	char addr[1024];
+	uint32_t length;
+	uint32_t lkey;
+
+	int num_sge;
+	enum ib_wr_opcode opcode;
+	enum ib_send_flags send_flags;
+	uint32_t imm_data; /* in network byte order */
+	union {
+		struct {
+			uint64_t remote_addr;
+			uint32_t rkey;
+		} rdma;
+		struct {
+			uint64_t remote_addr;
+			uint64_t compare_add;
+			uint64_t swap;
+			uint32_t rkey;
+		} atomic;
+		struct {
+			uint32_t remote_qpn;
+			uint32_t remote_qkey;
+		} ud;
+	} wr;
+};
+
+/* Allocate and fill a snapshot of @wr; the caller must kfree() the result. */
+struct ibb_send_wr *ib_dump_trap(struct ib_send_wr *wr);
+
+/* Broadcast @wr on the NETLINK_IB multicast group; does not free @wr. */
+void netlink_ib_send(struct ibb_send_wr *wr);
+
+#endif /* _LINUX_NETLINK_TRAP_H */
#include <sys/socket.h>
#include <sys/types.h>
#include <linux/netlink.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "mlx4.h"
/*
 * Netlink protocol number, multicast group and maximum payload size.
 * These are the same values the kernel module #defines in netlink_trap.c.
 */
#define NETLINK_IB 24
#define GROUP_IB 23
#define MAX_PAYLOAD 4096
/* Socket addresses: src_addr is bound by main(), dst_addr is passed as msg_name. */
struct sockaddr_nl src_addr, dst_addr;
/* Receive buffer for one netlink message; allocated in main(). */
struct nlmsghdr *nlh = NULL;
/* Message descriptor and its single I/O vector, handed to recvmsg(). */
struct msghdr msg;
struct iovec iov;
/* File descriptor of the netlink socket. */
int sock_fd;
/*
 * Userland copy of the record carried in each netlink message.
 * main() copies the raw message payload into this struct with memcpy(), so
 * the field order and sizes have to match the kernel-side struct ibb_send_wr
 * (declared in linux/netlink_trap.h by the patch).
 */
struct ibb_send_wr {
uint64_t wr_id;
/* Text buffer; this program prints it with %s. */
char addr[1024];
uint32_t length;
uint32_t lkey;
int num_sge;
enum ibv_wr_opcode opcode;
enum ibv_send_flags send_flags;
uint32_t imm_data;
/* rdma, atomic and ud are members of one union, so they share storage. */
union {
struct {
uint64_t remote_addr;
uint32_t rkey;
} rdma;
struct {
uint64_t remote_addr;
uint64_t compare_add;
uint64_t swap;
uint32_t rkey;
} atomic;
struct {
uint32_t remote_qpn;
uint32_t remote_qkey;
} ud;
} wr;
};
int main(int argc, char ** argv)
{
int err;
sock_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_IB);
if (sock_fd<0) {
char s[BUFSIZ];
sprintf( s, "%s: can't assign fd for socket", argv[0] );
perror(s);
return -1;
}
//memset(&src_addr, 0, sizeof(src_addr));
src_addr.nl_family = AF_NETLINK;
src_addr.nl_pad = 0;
src_addr.nl_pid = getpid();
src_addr.nl_groups = GROUP_IB; // Multicast
err = bind(sock_fd, (struct sockaddr*)&src_addr, sizeof(src_addr));
{
int on = GROUP_IB;
setsockopt(sock_fd, 270, NETLINK_ADD_MEMBERSHIP, &on, sizeof(on)); // 270 is SOL_NETLINK... Sock Option Level 270
}
if (err<0) {
char s[BUFSIZ];
sprintf( s, "%s: can't bind socket (%d)", argv[0], sock_fd );
perror(s);
return -1;
}
memset(&dst_addr, 0, sizeof(dst_addr));
nlh = (struct nlhmsghdr *) malloc(NLMSG_SPACE(MAX_PAYLOAD));
memset(nlh, 0, NLMSG_SPACE(MAX_PAYLOAD));
iov.iov_base = (void *)nlh;
iov.iov_len = NLMSG_SPACE(MAX_PAYLOAD);
msg.msg_name = (void *)&dst_addr;
msg.msg_namelen = sizeof(dst_addr);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
printf("Waiting for messages from kernel...\n");
int u=0;
struct ibb_send_wr * dump = malloc(sizeof(struct ibb_send_wr));
while(u<150)
{
u++;
recvmsg(sock_fd, &msg, 0);
memcpy(dump, NLMSG_DATA(nlh), sizeof(struct ibb_send_wr));
affichage_struct(dump);
printf("message: %s\n", (*dump).addr);
}
close(sock_fd);
return 0;
}
/*
 * Print every field of a received work-request snapshot to stdout, laid out
 * like the declaration of struct ibb_send_wr.
 *
 * bob: snapshot to print; nothing is printed when it is NULL.
 *
 * The 64-bit fields are cast to unsigned long long so they match %llx on
 * every ABI, and addr is printed with a precision of sizeof(bob->addr) so an
 * unterminated buffer cannot be read past its end. All three views of the
 * wr union are printed, whichever one the sender actually filled in.
 */
void affichage_struct(struct ibb_send_wr * bob)
{
	if (bob == NULL)
		return;

	//system("clear");
	printf("\n"
	       "struct ibb_send_wr {\n"
	       "uint64_t wr_id = %llx;\n"
	       "\n"
	       "char addr[1024] = %.*s; \n"
	       "uint32_t length = %02x; \n"
	       "uint32_t lkey = %02x; \n"
	       "\n"
	       "int num_sge = %i;\n"
	       "enum ibv_wr_opcode opcode = %i;\n"
	       "enum ibv_send_flags send_flags = %i;\n"
	       "uint32_t imm_data = %02x;\n"
	       "union {\n"
	       "struct {\n"
	       "uint64_t remote_addr = %llx ;\n"
	       "uint32_t rkey = %02x ;\n"
	       "} rdma;\n"
	       "struct {\n"
	       "uint64_t remote_addr = %llx;\n"
	       "uint64_t compare_add = %llx;\n"
	       "uint64_t swap = %llx;\n"
	       "uint32_t rkey = %02x;\n"
	       "} atomic;\n"
	       "struct {\n"
	       "uint32_t remote_qpn = %02x;\n"
	       "uint32_t remote_qkey = %02x;\n"
	       "} ud;\n"
	       "} wr;\n"
	       "};\n",
	       (unsigned long long)bob->wr_id,
	       (int)sizeof(bob->addr), bob->addr,
	       (unsigned int)bob->length,
	       (unsigned int)bob->lkey,
	       bob->num_sge,
	       (int)bob->opcode,
	       (int)bob->send_flags,
	       (unsigned int)bob->imm_data,
	       (unsigned long long)bob->wr.rdma.remote_addr,
	       (unsigned int)bob->wr.rdma.rkey,
	       (unsigned long long)bob->wr.atomic.remote_addr,
	       (unsigned long long)bob->wr.atomic.compare_add,
	       (unsigned long long)bob->wr.atomic.swap,
	       (unsigned int)bob->wr.atomic.rkey,
	       (unsigned int)bob->wr.ud.remote_qpn,
	       (unsigned int)bob->wr.ud.remote_qkey);
}
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general