Hi,

Here is my patch; I have worked on kernel 2.6.18 on CentOS 5.2.
I had some trouble accessing struct sg_list, but this
implementation allows forwarding information from struct ib_send_wr in
the function post_send (in mlx4/qp.c).
The userland program is also attached, but it only prints
the information on the console.

I will keep looking at this,

Regards,

Thierry



On Wed, Jun 24, 2009 at 7:55 PM, Jason
Gunthorpe<[email protected]> wrote:
> On Wed, Jun 24, 2009 at 06:55:43PM +0300, Moni Shoua wrote:
>
>> I believe that Jason and I still disagree but...
>
>> Jason suggests that I implement this feature with netlink. This
>> approach might have an advantage but if I understand it right this
>> approach requires a patch also to some user application in order to
>> take advantage of this patch.
>
> The kernel devs have made it clear that the preferred way to export
> this information is through netlink and/or a file in /proc/net/..
>
> You never got an answer if /proc/net is truely discouraged or not.
>
> debugfs is completely useless because it cannot be used for actual
> end-user interrogation and the rules are it should not be used as a
> stable kernel-user interface. (ie is NOT a new dumping ground like
> /proc/ was)
>
>> Also, I think that there is a value for a virtual text file under
>> debugfs to monitor rdma_cm connections easily and without any other
>> special app besides 'cat' (just like in IPoIB).
>
> If you have the netlink based program there really is no point in
> including something under debugfs. It is just bloat.
>
>> Finally, this implementation doesn't contradict netlink
>> implementation in the future (and it won't be the first time).
>
> As I said before, the stack has reached a level of maturity that new
> stuff going in should meaningfully 'move the ball forward' toward a
> mature and complete stack. In the context of state reporting that
> means a stable user interface that programs like lsof, netstat, etc,
> can rely upon.
>
> Putting something under debugfs clearly does nothing to advance that
> goal.
>
> Further, there is clearly alot of state information we should be
> exporting to userspace. RDMA-CM stuff is only a tiny portion. netlink
> is the kernel devs answer to all of this.
>
> Jason
>
diff -Nur linux-2.6.18.i686.ori/drivers/infiniband/hw/mlx4/qp.c linux-2.6.18.i686/drivers/infiniband/hw/mlx4/qp.c
--- linux-2.6.18.i686.ori/drivers/infiniband/hw/mlx4/qp.c	2009-06-25 14:31:17.000000000 +0200
+++ linux-2.6.18.i686/drivers/infiniband/hw/mlx4/qp.c	2009-06-08 10:14:32.000000000 +0200
@@ -39,6 +39,11 @@
 #include "mlx4_ib.h"
 #include "user.h"
 
+#include <linux/netlink_trap.h>
+
+extern void netlink_ib_send(struct ibb_send_wr *wr);
+extern struct ibb_send_wr * ib_dump_trap(struct ib_send_wr * wr);
+
 enum {
 	MLX4_IB_ACK_REQ_FREQ	= 8,
 };
@@ -1529,7 +1534,50 @@
 	int uninitialized_var(stamp);
 	int uninitialized_var(size);
 	int i;
+	struct ibb_send_wr * result = kmalloc(sizeof(struct ibb_send_wr), GFP_KERNEL);
+//	result = ib_dump_trap(wr);
+//	netlink_ib_send(result);
 
+/*	if (wr->sg_list != NULL)
+	{
+		printk(KERN_INFO "wr_id = %llx\n",  wr->wr_id);
+		printk(KERN_INFO "lkey = %02x\nlength = %02x\naddr= %llx\n",wr->sg_list->lkey,wr->sg_list->length,wr->sg_list->addr);//(*sg_list).lkey);
+		printk(KERN_INFO "lkey = %02x\nlength = %02x\naddr= %llx\n",wr->sg_list->lkey,ntohl(wr->sg_list->length),ntohl(wr->sg_list->addr));//(*sg_list).lkey);
+		printk(KERN_INFO "num_sge = %i\n",  wr->num_sge);
+		printk(KERN_INFO "flag = %i\n",wr->send_flags);
+
+		printk(KERN_INFO "remote_addr = %llx\n",  wr->wr.rdma.remote_addr);
+		printk(KERN_INFO "rkey = %llx\n",  wr->wr.rdma.rkey);
+
+		printk(KERN_INFO "remote_addr = %llx\n",  wr->wr.atomic.remote_addr);
+		printk(KERN_INFO "compare_add = %llx\n",  wr->wr.atomic.compare_add);
+		printk(KERN_INFO "swap = %llx\n",  wr->wr.atomic.swap);
+		printk(KERN_INFO "rkey = %llx\n",  wr->wr.atomic.remote_addr);
+
+		printk(KERN_INFO "hlen = %i\n",  wr->wr.ud.hlen);
+		printk(KERN_INFO "mss = %i\n",  wr->wr.ud.mss);
+		printk(KERN_INFO "remote_qpn = %02x\n",  wr->wr.ud.remote_qpn);
+		printk(KERN_INFO "remote_qkey = %02x\n",  wr->wr.ud.remote_qkey);
+		printk(KERN_INFO "pkey_index = %01x\n",  wr->wr.ud.pkey_index);
+
+		//char * bob;
+		//bob = kmalloc(104*sizeof(char), GFP_ATOMIC);
+		unsigned long addr_phys = virt_to_phys(wr->sg_list->addr);
+		printk(KERN_INFO "addr phys = %lx\n\n",addr_phys);
+		*/
+/*
+		int u = wr->sg_list->length;
+		char vor[wr->sg_list->length];
+		int i;
+		for(i=0;i<u;i++) {
+			bob[i]='a';
+		}
+			memcpy(vor, bob, u);//sizeof(char));
+		char truc;
+		printk(KERN_INFO "msg : %s\n", vor);*/
+		//memcpy(bob, wr->sg_list->addr, wr->sg_list->length);
+		//printk(KERN_INFO "msg : %s\n", bob);
+//	}
 	spin_lock_irqsave(&qp->sq.lock, flags);
 
 	ind = qp->sq_next_wqe;
diff -Nur linux-2.6.18.i686.ori/drivers/infiniband/util/Kconfig linux-2.6.18.i686/drivers/infiniband/util/Kconfig
--- linux-2.6.18.i686.ori/drivers/infiniband/util/Kconfig	2009-06-25 14:31:10.000000000 +0200
+++ linux-2.6.18.i686/drivers/infiniband/util/Kconfig	2009-04-29 11:39:58.000000000 +0200
@@ -4,3 +4,5 @@
 	---help---
 	  Prints sent and received MADs on QP 0/1 for debugging.
 
+config INFINIBAND_NETLINK_TRAP
+        tristate "Infiniband trap using netlink"
Les fichiers binaires linux-2.6.18.i686.ori/drivers/infiniband/util/kernel_part.tgz et linux-2.6.18.i686/drivers/infiniband/util/kernel_part.tgz sont différents.
diff -Nur linux-2.6.18.i686.ori/drivers/infiniband/util/Makefile linux-2.6.18.i686/drivers/infiniband/util/Makefile
--- linux-2.6.18.i686.ori/drivers/infiniband/util/Makefile	2009-06-25 14:31:12.000000000 +0200
+++ linux-2.6.18.i686/drivers/infiniband/util/Makefile	2009-04-29 11:40:29.000000000 +0200
@@ -1,3 +1,5 @@
 obj-$(CONFIG_INFINIBAND_MADEYE)	+= ib_madeye.o
+obj-$(CONFIG_INFINIBAND_NETLINK_TRAP) += ib_netlink_trap.o
 
 ib_madeye-y := madeye.o
+ib_netlink_trap-y := netlink_trap.o
diff -Nur linux-2.6.18.i686.ori/drivers/infiniband/util/netlink_trap.c linux-2.6.18.i686/drivers/infiniband/util/netlink_trap.c
--- linux-2.6.18.i686.ori/drivers/infiniband/util/netlink_trap.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.18.i686/drivers/infiniband/util/netlink_trap.c	2009-06-05 14:27:00.000000000 +0200
@@ -0,0 +1,141 @@
+#include <linux/autoconf.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <rdma/ib_verbs.h>
+#include <linux/netlink_trap.h>
+
+/*
+  The maximum payload size is set to an arbitrary value.
+  The netlink_ib protocol is set to an unused value, here 24.
+  This value should eventually be defined in the netlink header, linux/netlink.h.
+  GROUP_IB is the multicast group, chosen arbitrarily.
+*/
+#define MAX_PAYLOAD 4096
+#define NETLINK_IB 24
+#define GROUP_IB 23
+
+
+
+static struct sock *nl_sk = NULL; //Socket netlink
+
+/*
+  Callback invoked when data is received in the kernel on the netlink socket; it logs a message and wakes up sleepers on sk->sk_sleep.
+  Not used for the moment (original author's note); it is the callback handed to netlink_kernel_create() in netlink_ib_module_init().
+*/
+static void nl_ib_data_ready (struct sock *sk, int len)
+{
+	printk(KERN_INFO "msg recu dans le noyau\n");
+	wake_up_interruptible(sk->sk_sleep);
+}
+
+/*
+  Main function: copies the ibb_send_wr passed as argument into a new skb and broadcasts it on the netlink socket nl_sk to group GROUP_IB. NOTE(review): wr is not freed here -- confirm who releases it.
+*/
+static void netlink_ib_send(struct ibb_send_wr *wr) /* NOTE(review): static here, yet passed to EXPORT_SYMBOL below and declared extern in qp.c -- confirm intended linkage */
+{
+	struct sk_buff *skb = NULL; // Send buffer
+	struct nlmsghdr *nlh = NULL; // Netlink header
+	int err;
+
+
+	// Allocate the send buffer. NOTE(review): the result is not checked for NULL before skb->data is read below
+	skb = alloc_skb(NLMSG_SPACE(MAX_PAYLOAD),GFP_KERNEL);
+
+	// Fill in the netlink header. Only nlmsg_pid and nlmsg_flags are assigned; nlmsg_len, nlmsg_type and nlmsg_seq are not set here
+	nlh = (struct nlmsghdr *)skb->data;
+	nlh = (struct nlmsghdr *)skb_put(skb, NLMSG_ALIGN(MAX_PAYLOAD)); // overwrites the value assigned on the previous line
+	nlh->nlmsg_pid = 0; // Sent from the kernel
+	nlh->nlmsg_flags = 0;
+
+	// Fill the buffer with the data, here a character string
+//	strcpy(NLMSG_DATA(nlh), "Greeting from kernel!");
+	
+//	printk(KERN_INFO "kernel: num_sge = %i\n",(*wr).num_sge);
+
+	memcpy(NLMSG_DATA(nlh), wr, sizeof(struct ibb_send_wr)); // copies the whole struct, addr[1024] included
+	
+
+	// Fill in the skb control block to prepare the send
+	NETLINK_CB(skb).pid = 0;
+	NETLINK_CB(skb).dst_pid = 0; //Multicast
+	NETLINK_CB(skb).dst_group = GROUP_IB;// Send to multicast group GROUP_IB; for unicast use 0
+	
+//	printk( KERN_INFO "Sending data...\n");
+
+	// Multicast send: socket, buffer, , multicast group, 
+	err = netlink_broadcast(nl_sk, skb, 0, GROUP_IB, GFP_KERNEL); // err is only examined by the disabled block below
+/*	if (err<0) {
+		if (err == -3) {
+			printk(KERN_INFO "No listenners\n"); //Nobody is listening
+		}
+		else
+		{
+			printk(KERN_INFO "Error during netlink_broadcast: %i\n",err); //other error
+		}
+	}*/
+}
+
+struct ibb_send_wr * ib_dump_trap(struct ib_send_wr * wr) /* Copies selected fields of wr into a newly kmalloc'ed struct ibb_send_wr and returns it; nothing in this function frees it. */
+{
+	struct ibb_send_wr * result = kmalloc(sizeof(struct ibb_send_wr), GFP_KERNEL); /* NOTE(review): not checked for NULL and not zeroed before the stores below */
+/*	if (wr->sg_list != NULL)
+	{
+		strncpy((*result).addr, wr->sg_list->addr, wr->sg_list->length);
+		(*result).length 		=	wr->sg_list->length;
+		(*result).lkey 		=	wr->sg_list->lkey;
+	}
+*/
+
+	(*result).num_sge			=	wr->num_sge; /* wr_id, addr, length and lkey are never assigned in this function (the sg_list copy above is disabled) */
+	(*result).opcode			=	wr->opcode;
+	(*result).send_flags		=	wr->send_flags;
+	(*result).imm_data			=	wr->imm_data;
+
+	(*result).wr.rdma.remote_addr	=	wr->wr.rdma.remote_addr; /* result->wr is a union: the rdma, atomic and ud stores below write overlapping storage */
+	(*result).wr.rdma.rkey		=	wr->wr.rdma.rkey;
+
+	(*result).wr.atomic.remote_addr	=	wr->wr.atomic.remote_addr;
+	(*result).wr.atomic.compare_add	=	wr->wr.atomic.compare_add;
+	(*result).wr.atomic.swap		=	wr->wr.atomic.swap;
+	(*result).wr.atomic.rkey		=	wr->wr.atomic.rkey;
+
+	(*result).wr.ud.remote_qpn		=	wr->wr.ud.remote_qpn; /* last writer: these two stores land on the bytes that also hold rdma/atomic remote_addr */
+	(*result).wr.ud.remote_qkey	=	wr->wr.ud.remote_qkey;
+	return result;
+}
+
+/*
+  Initialisation function, run when the module is loaded: creates the kernel netlink socket and stores it in nl_sk
+*/
+static int __init netlink_ib_module_init(void)
+{
+	printk(KERN_INFO "Init netlink_ib module\n");
+	// Create the socket: struct net, protocol, multicast group, callback, , .
+	nl_sk = netlink_kernel_create(&init_net, NETLINK_IB, GROUP_IB, nl_ib_data_ready, NULL, THIS_MODULE);
+	if (nl_sk == NULL) {
+		printk(KERN_INFO "Error during netlink_kernel_create"); // NOTE(review): this message has no trailing "\n"
+	}
+	return 0; // NOTE(review): 0 is returned even when nl_sk is NULL
+}
+
+/*
+  Function run when the module is unloaded: releases the socket behind nl_sk
+*/
+static void __exit netlink_ib_module_exit(void)
+{
+	printk(KERN_INFO "Unloading netlink_ib module\n");
+	sock_release(nl_sk->sk_socket); // NOTE(review): nl_sk is dereferenced unconditionally; init leaves it NULL when socket creation fails
+}
+
+MODULE_AUTHOR("[email protected]");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("0.6");
+MODULE_DESCRIPTION("Kernel trap using netlink socket for IB support");
+EXPORT_SYMBOL(netlink_ib_send);
+EXPORT_SYMBOL(ib_dump_trap);
+module_init(netlink_ib_module_init);
+module_exit(netlink_ib_module_exit);
diff -Nur linux-2.6.18.i686.ori/include/linux/netlink_trap.h linux-2.6.18.i686/include/linux/netlink_trap.h
--- linux-2.6.18.i686.ori/include/linux/netlink_trap.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.18.i686/include/linux/netlink_trap.h	2009-06-05 14:27:27.000000000 +0200
@@ -0,0 +1,32 @@
+
+struct ibb_send_wr * ib_dump_trap(struct ib_send_wr * wr); /* NOTE(review): struct ibb_send_wr is first named here, before its definition below */
+static void netlink_ib_send(struct ibb_send_wr *wr); /* NOTE(review): static prototype in a header that qp.c includes next to its own extern declaration of the same name */
+
+struct ibb_send_wr { /* record that netlink_ib_send() memcpy()s whole into the netlink message payload */
+	uint64_t		wr_id;
+
+	char			addr[1024]; /* presumably the data of the first scatter/gather entry (see the disabled copy in ib_dump_trap) -- confirm */
+	uint32_t		length;
+	uint32_t		lkey;
+
+	int			num_sge;
+	enum ib_wr_opcode	opcode;
+	enum ib_send_flags	send_flags;
+	uint32_t		imm_data;	/* in network byte order */
+	union { /* the three members below share storage */
+		struct {
+			uint64_t	remote_addr;
+			uint32_t	rkey;
+		} rdma;
+		struct {
+			uint64_t	remote_addr;
+			uint64_t	compare_add;
+			uint64_t	swap;
+			uint32_t	rkey;
+		} atomic;
+		struct {
+			uint32_t	remote_qpn;
+			uint32_t	remote_qkey;
+		} ud;
+	} wr;
+};
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "mlx4.h"

/* Netlink protocol number passed to socket(); same value as NETLINK_IB in the kernel module. */
#define NETLINK_IB 24
/* Multicast group that main() subscribes to; same value as GROUP_IB in the kernel module. */
#define GROUP_IB 23
/* Payload capacity, in bytes, of the receive buffer allocated in main(). */
#define MAX_PAYLOAD 4096
/* File-scope state used by main(): local and peer netlink addresses, */
struct sockaddr_nl src_addr, dst_addr;
/* the receive buffer (netlink header followed by payload), */
struct nlmsghdr *nlh = NULL;
/* the recvmsg() descriptor and its single I/O vector, */
struct msghdr msg;
struct iovec iov;
/* and the netlink socket descriptor. */
int sock_fd;


/*
 * Userland copy of the record sent by the kernel module. main() memcpy()s
 * the netlink payload straight into this struct, so its layout is expected
 * to match struct ibb_send_wr in include/linux/netlink_trap.h -- TODO
 * confirm on the target ABI. The enum members use the libibverbs names
 * (ibv_*) where the kernel header uses ib_*.
 */
struct ibb_send_wr {
	uint64_t		wr_id;

	/* printed as a string by main() and affichage_struct() */
	char		addr[1024];
	uint32_t		length;
	uint32_t		lkey;

	int			num_sge;
	enum ibv_wr_opcode	opcode;
	enum ibv_send_flags	send_flags;
	uint32_t		imm_data;
	/* the three members below share storage */
	union {
		struct {
			uint64_t	remote_addr;
			uint32_t	rkey;
		} rdma;
		struct {
			uint64_t	remote_addr;
			uint64_t	compare_add;
			uint64_t	swap;
			uint32_t	rkey;
		} atomic;
		struct {
			uint32_t	remote_qpn;
			uint32_t	remote_qkey;
		} ud;
	} wr;
};


int main(int argc, char ** argv)
{
	int err;
	sock_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_IB);
	if (sock_fd<0) {
	        char s[BUFSIZ];
	        sprintf( s, "%s: can't assign fd for socket", argv[0] );
	        perror(s);
	        return -1;
	}

	//memset(&src_addr, 0, sizeof(src_addr));
	src_addr.nl_family = AF_NETLINK;
	src_addr.nl_pad = 0;
	src_addr.nl_pid = getpid();
	src_addr.nl_groups = GROUP_IB; // Multicast

	err = bind(sock_fd, (struct sockaddr*)&src_addr, sizeof(src_addr));
	{
		int on = GROUP_IB;
		setsockopt(sock_fd, 270, NETLINK_ADD_MEMBERSHIP, &on, sizeof(on)); // 270 is SOL_NETLINK... Sock Option Level 270
	}
	if (err<0) {
	        char s[BUFSIZ];
        	sprintf( s, "%s: can't bind socket (%d)", argv[0], sock_fd );
	        perror(s);
	        return -1;
	}

	memset(&dst_addr, 0, sizeof(dst_addr));
	nlh = (struct nlhmsghdr *) malloc(NLMSG_SPACE(MAX_PAYLOAD));
	memset(nlh, 0, NLMSG_SPACE(MAX_PAYLOAD));

	iov.iov_base = (void *)nlh;
	iov.iov_len = NLMSG_SPACE(MAX_PAYLOAD);
	msg.msg_name = (void *)&dst_addr;
	msg.msg_namelen = sizeof(dst_addr);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	printf("Waiting for messages from kernel...\n");
	int u=0;
	struct ibb_send_wr * dump = malloc(sizeof(struct ibb_send_wr));
	while(u<150)
	{
	u++;
		recvmsg(sock_fd, &msg, 0);
		memcpy(dump, NLMSG_DATA(nlh), sizeof(struct ibb_send_wr));
		affichage_struct(dump);
		printf("message: %s\n", (*dump).addr);
	}
	
	close(sock_fd);

	return 0;

}


/*
 * Print every field of *bob to stdout, laid out like the declaration of
 * struct ibb_send_wr. All three members of the wr union are printed, so
 * they show different views of the same bytes.
 *
 * bob: record to display; a NULL pointer is ignored.
 *
 * Each argument is cast to the exact type its conversion specifier expects
 * (unsigned long long for %llx, unsigned int for %02x, int for %i), and
 * addr is printed with a precision bound so a missing terminator cannot
 * make printf read past the array.
 */
void affichage_struct(struct ibb_send_wr * bob)
{
	if (bob == NULL)
		return;

//system("clear");
	printf("\n\
struct ibb_send_wr {\n\
        uint64_t                wr_id      = %llx;\n\
\n\
        char            addr[1024] = %.*s; \n\
        uint32_t        length     = %02x; \n\
        uint32_t        lkey       = %02x; \n\
\n\
        int                     num_sge    = %i;\n\
        enum ibv_wr_opcode      opcode     = %i;\n\
        enum ibv_send_flags     send_flags = %i;\n\
        uint32_t                imm_data   = %02x;\n\
        union {\n\
                struct {\n\
                        uint64_t        remote_addr = %llx ;\n\
                        uint32_t        rkey        = %02x ;\n\
                } rdma;\n\
               struct {\n\
                       uint64_t        remote_addr  = %llx;\n\
                       uint64_t        compare_add  = %llx;\n\
                       uint64_t        swap         = %llx;\n\
                       uint32_t        rkey         = %02x;\n\
               } atomic;\n\
               struct {\n\
                       uint32_t        remote_qpn   = %02x;\n\
                       uint32_t        remote_qkey  = %02x;\n\
               } ud;\n\
       } wr;\n\
};\n\
",
	       (unsigned long long)bob->wr_id,
	       (int)sizeof(bob->addr), bob->addr,
	       (unsigned int)bob->length,
	       (unsigned int)bob->lkey,
	       bob->num_sge,
	       (int)bob->opcode,
	       (int)bob->send_flags,
	       (unsigned int)bob->imm_data,
	       (unsigned long long)bob->wr.rdma.remote_addr,
	       (unsigned int)bob->wr.rdma.rkey,
	       (unsigned long long)bob->wr.atomic.remote_addr,
	       (unsigned long long)bob->wr.atomic.compare_add,
	       (unsigned long long)bob->wr.atomic.swap,
	       (unsigned int)bob->wr.atomic.rkey,
	       (unsigned int)bob->wr.ud.remote_qpn,
	       (unsigned int)bob->wr.ud.remote_qkey);


}
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to