And here's the new patch, merging Rusty's suggestions and some more of my own.

May I upload this, or does Rusty (or anyone else) have any more suggestions?

On 4/4/07, Rusty Russell <[EMAIL PROTECTED]> wrote:
On Wed, 2007-04-04 at 13:03 -0300, Glauber de Oliveira Costa wrote:
> This is a new version of the unified lguest launcher that applies to
> the current tree. According to rusty's suggestion, I'm bothering less
> to be able to load 32 bit kernels on 64-bit machines: changing the
> launcher for such case would be the easy part! In the absence of
> further objections, I'll commit it.
>
> Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>

Hi Glauber!

The patch looks more than reasonable, but I think we can go further with
the abstraction.  If you could spin it again, I'll apply it.  There may
be more cleanups after that, but I don't want to hold up your progress!


--
Glauber de Oliveira Costa.
"Free as in Freedom"

"The less confident you are, the more serious you have to act."
 Makefile             |    9 +-
 i386/defines         |    1 
 i386/lguest_defs.h   |    9 ++
 lguest.c             |  177 +++++++++++++++++++++++++++++++++++++++------------
 x86_64/defines       |    1 
 x86_64/lguest_defs.h |   11 +++
 6 files changed, 165 insertions(+), 43 deletions(-)
Index: linux-2.6-mm/Documentation/lguest/Makefile
===================================================================
--- linux-2.6-mm.orig/Documentation/lguest/Makefile
+++ linux-2.6-mm/Documentation/lguest/Makefile
@@ -1,12 +1,13 @@
 # This creates the demonstration utility "lguest" which runs a Linux guest.
 
-# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
-# Some shells (dash - ubunu) can't handle numbers that big so we cheat.
+# We could use `uname -i`, but it seems to return "unknown" on many systems.
+ARCH:=$(shell uname -m | sed s/i[3456]86/i386/)
+
 include ../../.config
-LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
+include $(ARCH)/defines
 
 CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 \
-	-static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds
+	-static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds -I$(ARCH)
 LDLIBS:=-lz
 
 all: lguest.lds lguest
Index: linux-2.6-mm/Documentation/lguest/i386/defines
===================================================================
--- /dev/null
+++ linux-2.6-mm/Documentation/lguest/i386/defines
@@ -0,0 +1 @@
+LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
Index: linux-2.6-mm/Documentation/lguest/i386/lguest_defs.h
===================================================================
--- /dev/null
+++ linux-2.6-mm/Documentation/lguest/i386/lguest_defs.h
@@ -0,0 +1,9 @@
+#ifndef _LGUEST_DEFS_H_
+#define _LGUEST_DEFS_H_
+
+#include "../../../include/linux/lguest_launcher.h"
+typedef Elf32_Ehdr Elf_Ehdr;
+typedef Elf32_Phdr Elf_Phdr;
+#define ELF_MACHINE EM_386
+
+#endif
Index: linux-2.6-mm/Documentation/lguest/lguest.c
===================================================================
--- linux-2.6-mm.orig/Documentation/lguest/lguest.c
+++ linux-2.6-mm/Documentation/lguest/lguest.c
@@ -30,10 +30,11 @@
 #include <termios.h>
 #include <getopt.h>
 #include <zlib.h>
+typedef uint64_t u64;
 typedef uint32_t u32;
 typedef uint16_t u16;
 typedef uint8_t u8;
-#include "../../include/linux/lguest_launcher.h"
+#include <lguest_defs.h>
 
 #define PAGE_PRESENT 0x7 	/* Present, RW, Execute */
 #define NET_PEERNUM 1
@@ -64,8 +65,8 @@ struct device
 
 	/* Watch DMA to this key if handle_input non-NULL. */
 	unsigned long watch_key;
-	u32 (*handle_output)(int fd, const struct iovec *iov,
-			     unsigned int num, struct device *me);
+	unsigned long (*handle_output)(int fd, const struct iovec *iov,
+			     	       unsigned int num, struct device *me);
 
 	/* Device-specific data. */
 	void *priv;
@@ -93,21 +94,31 @@ static void *map_zeroed_pages(unsigned l
 	return (void *)addr;
 }
 
+static inline bool is_vsyscall_segment(Elf_Phdr *phdr)
+{
+#ifdef __x86_64__
+	return phdr->p_vaddr == VSYSCALL_START;
+#else
+	return 0;
+#endif
+}
+
 /* Returns the entry point */
-static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
+static unsigned long map_elf(int elf_fd, Elf_Ehdr *ehdr,
 			     unsigned long *page_offset)
 {
+	Elf_Phdr phdr[ehdr->e_phnum];
 	void *addr;
-	Elf32_Phdr phdr[ehdr->e_phnum];
 	unsigned int i;
 
 	/* Sanity checks. */
 	if (ehdr->e_type != ET_EXEC
-	    || ehdr->e_machine != EM_386
-	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
-	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
+	    || ehdr->e_machine != ELF_MACHINE
+	    || ehdr->e_phentsize != sizeof(phdr[0])
+	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(phdr[0]))
 		errx(1, "Malformed elf header");
 
+
 	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
 		err(1, "Seeking to program headers");
 	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
@@ -120,13 +131,14 @@ static unsigned long map_elf(int elf_fd,
 		if (phdr[i].p_type != PT_LOAD)
 			continue;
 
-		verbose("Section %i: size %i addr %p\n",
-			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
+		verbose("Section %i: size %lu addr %p\n",
+		i, (unsigned long)phdr[i].p_memsz, (void *)phdr[i].p_paddr);
 
 		/* We expect linear address space. */
 		if (!*page_offset)
 			*page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
-		else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
+		else if ((*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
+			 && (!is_vsyscall_segment(&phdr[i])))
 			errx(1, "Page offset of section %i different", i);
 
 		/* We map everything private, writable. */
@@ -210,15 +222,18 @@ static unsigned long load_bzimage(int fd
 	errx(1, "Could not find kernel in bzImage");
 }
 
-static unsigned long load_kernel(int fd, unsigned long *page_offset)
+/* On x86_64, the page table hierarchy does not start at a common location;
+ * instead it is indicated by a symbol called boot_level4_pgt. Because of this,
+ * the ELF header must survive this function so we can look up the pgdir. */
+static unsigned long load_kernel(int fd, unsigned long *page_offset,
+								Elf_Ehdr *hdr)
 {
-	Elf32_Ehdr hdr;
 
-	if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
+	if (read(fd, hdr, sizeof(*hdr)) != sizeof(*hdr))
 		err(1, "Reading kernel");
 
-	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
-		return map_elf(fd, &hdr, page_offset);
+	if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) == 0)
+		return map_elf(fd, hdr, page_offset);
 
 	return load_bzimage(fd, page_offset);
 }
@@ -250,6 +265,7 @@ static inline unsigned long page_align(u
 	return ((addr + getpagesize()-1) & ~(getpagesize()-1));
 }
 
+#ifndef __x86_64__
 static unsigned long setup_pagetables(unsigned long mem,
 				      unsigned long initrd_size,
 				      unsigned long page_offset)
@@ -285,6 +301,74 @@ static unsigned long setup_pagetables(un
 
 	return (unsigned long)pgdir;
 }
+#else
+static unsigned long find_pgt_symbol(int elf_fd,
+				     Elf_Ehdr *ehdr,
+				     unsigned long page_offset)
+{
+	unsigned int i;
+	Elf64_Shdr sec[ehdr->e_shnum];
+	Elf64_Sym *syms;
+	char *strtab = NULL;
+	unsigned long pgdir_addr = 0;
+	unsigned long nsyms = 0;
+
+	/* Now process sections searching for boot page tables
+	 * Start by finding the symtab section */
+	if (lseek(elf_fd, ehdr->e_shoff, SEEK_SET) < 0)
+		err(1, "Seeking to section headers");
+	if (read(elf_fd, sec, sizeof(sec)) != sizeof(sec))
+		err(1, "Reading section headers");
+
+	for (i = 0; i < ehdr->e_shnum; i++) {
+		if (sec[i].sh_type == SHT_SYMTAB) {
+			int ret = 0;
+			syms = malloc(sec[i].sh_size);
+			if (!syms)
+				err(1,"Not enough memory for symbol table");
+			ret = lseek(elf_fd, sec[i].sh_offset, SEEK_SET);
+			if (ret < 0)
+				err(1, "Seeking to symbol table");
+			ret = read(elf_fd, syms, sec[i].sh_size);
+			if (ret != sec[i].sh_size)
+				err(1, "Reading symbol table");
+			nsyms = sec[i].sh_size / sizeof(Elf64_Sym);
+
+
+			/* symtab links to strtab. We use it to find symbol
+			 * names */
+			strtab = malloc(sec[sec[i].sh_link].sh_size);
+			if (!strtab)
+				err(1,"Not enough memory for string table");
+			ret = lseek(elf_fd, sec[sec[i].sh_link].sh_offset , SEEK_SET);
+			if (ret < 0)
+				err(1, "Seeking to string table");
+			ret = read(elf_fd, strtab, sec[sec[i].sh_link].sh_size);
+			if (ret != sec[sec[i].sh_link].sh_size)
+				err(1, "Reading string table");
+			break;
+		}
+	}
+
+	/* We now have a pointer to the symtab, start searching for the symbol */
+	for (i = 0; i < nsyms; i++) {
+		if ((syms[i].st_shndx == SHN_UNDEF) || !syms[i].st_name)
+			continue;
+		if (!strcmp("boot_level4_pgt",
+				(char *)((u64)syms[i].st_name + strtab))) {
+			pgdir_addr = syms[i].st_value - page_offset;
+			break;
+		}
+	}
+
+	if (!pgdir_addr) {
+		errno = ESRCH;
+		err(1,"Unable to find boot pgdir");
+	}
+
+	return pgdir_addr;
+}
+#endif
 
 static void concat(char *dst, char *args[])
 {
@@ -299,9 +383,10 @@ static void concat(char *dst, char *args
 	dst[len] = '\0';
 }
 
-static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
+static int tell_kernel(unsigned long pgdir, unsigned long start,
+					    unsigned long page_offset)
 {
-	u32 args[] = { LHREQ_INITIALIZE,
+	unsigned long args[] = { LHREQ_INITIALIZE,
 		       LGUEST_GUEST_TOP/getpagesize(), /* Just below us */
 		       pgdir, start, page_offset };
 	int fd;
@@ -379,7 +464,8 @@ static void *_check_pointer(unsigned lon
 #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
 
 /* Returns pointer to dma->used_len */
-static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
+static unsigned long *dma2iov(unsigned long dma, struct iovec iov[],
+								unsigned *num)
 {
 	unsigned int i;
 	struct lguest_dma *udma;
@@ -396,12 +482,12 @@ static u32 *dma2iov(unsigned long dma, s
 	return &udma->used_len;
 }
 
-static u32 *get_dma_buffer(int fd, void *key,
+static unsigned long *get_dma_buffer(int fd, void *key,
 			   struct iovec iov[], unsigned int *num, u32 *irq)
 {
-	u32 buf[] = { LHREQ_GETDMA, (u32)key };
+	unsigned long buf[] = { LHREQ_GETDMA, (unsigned long)key };
 	unsigned long udma;
-	u32 *res;
+	unsigned long *res;
 
 	udma = write(fd, buf, sizeof(buf));
 	if (udma == (unsigned long)-1)
@@ -415,7 +501,7 @@ static u32 *get_dma_buffer(int fd, void 
 
 static void trigger_irq(int fd, u32 irq)
 {
-	u32 buf[] = { LHREQ_IRQ, irq };
+	unsigned long buf[] = { LHREQ_IRQ, irq };
 	if (write(fd, buf, sizeof(buf)) != 0)
 		err(1, "Triggering irq %i", irq);
 }
@@ -443,7 +529,8 @@ struct console_abort
 /* We DMA input to buffer bound at start of console page. */
 static bool handle_console_input(int fd, struct device *dev)
 {
-	u32 irq = 0, *lenp;
+	u32 irq = 0;
+	unsigned long *lenp;
 	int len;
 	unsigned int num;
 	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
@@ -487,14 +574,14 @@ static bool handle_console_input(int fd,
 	return true;
 }
 
-static u32 handle_console_output(int fd, const struct iovec *iov,
-				 unsigned num, struct device*dev)
+static unsigned long  handle_console_output(int fd, const struct iovec *iov,
+					 unsigned num, struct device*dev)
 {
 	return writev(STDOUT_FILENO, iov, num);
 }
 
-static u32 handle_tun_output(int fd, const struct iovec *iov,
-			     unsigned num, struct device *dev)
+static unsigned long  handle_tun_output(int fd, const struct iovec *iov,
+					     unsigned num, struct device *dev)
 {
 	/* Now we've seen output, we should warn if we can't get buffers. */
 	*(bool *)dev->priv = true;
@@ -508,7 +595,8 @@ static unsigned long peer_offset(unsigne
 
 static bool handle_tun_input(int fd, struct device *dev)
 {
-	u32 irq = 0, *lenp;
+	u32 irq = 0;
+	unsigned long *lenp;
 	int len;
 	unsigned num;
 	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
@@ -534,11 +622,12 @@ static bool handle_tun_input(int fd, str
 	return true;
 }
 
-static u32 handle_block_output(int fd, const struct iovec *iov,
-			       unsigned num, struct device *dev)
+static unsigned long handle_block_output(int fd, const struct iovec *iov,
+					 unsigned num, struct device *dev)
 {
 	struct lguest_block_page *p = dev->mem;
-	u32 irq, *lenp;
+	u32 irq;
+	unsigned long *lenp;
 	unsigned int len, reply_num;
 	struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
 	off64_t device_len, off = (off64_t)p->sector * 512;
@@ -546,11 +635,14 @@ static u32 handle_block_output(int fd, c
 	device_len = *(off64_t *)dev->priv;
 
 	if (off >= device_len)
-		err(1, "Bad offset %llu vs %llu", off, device_len);
+		err(1, "Bad offset %llu vs %llu",
+				(unsigned long long)off,
+				(unsigned long long)device_len);
 	if (lseek64(dev->fd, off, SEEK_SET) != off)
 		err(1, "Bad seek to sector %i", p->sector);
 
-	verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
+	verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ",
+						(unsigned long long)off);
 
 	lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
 	if (!lenp)
@@ -560,7 +652,8 @@ static u32 handle_block_output(int fd, c
 		len = writev(dev->fd, iov, num);
 		if (off + len > device_len) {
 			ftruncate(dev->fd, device_len);
-			errx(1, "Write past end %llu+%u", off, len);
+			errx(1, "Write past end %llu+%u",
+					(unsigned long long)off, len);
 		}
 		*lenp = 0;
 	} else {
@@ -577,7 +670,7 @@ static void handle_output(int fd, unsign
 			  struct device_list *devices)
 {
 	struct device *i;
-	u32 *lenp;
+	unsigned long *lenp;
 	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
 	unsigned num = 0;
 
@@ -639,7 +732,7 @@ static struct device *new_device(struct 
 				 int fd,
 				 bool (*handle_input)(int, struct device *),
 				 unsigned long watch_off,
-				 u32 (*handle_output)(int,
+				 unsigned long (*handle_output)(int,
 						      const struct iovec *,
 						      unsigned,
 						      struct device *))
@@ -916,9 +1009,11 @@ int main(int argc, char *argv[])
 {
 	unsigned long mem, pgdir, start, page_offset;
 	int c, lguest_fd, waker_fd;
+	int elf_fd;
 	struct device_list device_list;
 	struct lguest_boot_info *boot = (void *)0;
 	const char *initrd_name = NULL;
+	Elf_Ehdr ehdr;
 
 	device_list.max_infd = -1;
 	device_list.dev = NULL;
@@ -958,8 +1053,8 @@ int main(int argc, char *argv[])
 	map_zeroed_pages(0, mem / getpagesize());
 
 	/* Now we load the kernel */
-	start = load_kernel(open_or_die(argv[optind+1], O_RDONLY),
-			    &page_offset);
+	elf_fd = open_or_die(argv[optind+1], O_RDONLY);
+	start = load_kernel(elf_fd, &page_offset, &ehdr);
 
 	/* Write the device descriptors into memory. */
 	map_device_descriptors(&device_list, mem);
@@ -969,7 +1064,11 @@ int main(int argc, char *argv[])
 		boot->initrd_size = load_initrd(initrd_name, mem);
 
 	/* Set up the initial linar pagetables. */
+#ifndef __x86_64__
 	pgdir = setup_pagetables(mem, boot->initrd_size, page_offset);
+#else
+	pgdir = find_pgt_symbol(elf_fd, &ehdr, page_offset);
+#endif
 
 	/* Give the guest the boot information it needs. */
 	concat(boot->cmdline, argv+optind+2);
Index: linux-2.6-mm/Documentation/lguest/x86_64/defines
===================================================================
--- /dev/null
+++ linux-2.6-mm/Documentation/lguest/x86_64/defines
@@ -0,0 +1 @@
+LGUEST_GUEST_TOP := 0x7f000000
Index: linux-2.6-mm/Documentation/lguest/x86_64/lguest_defs.h
===================================================================
--- /dev/null
+++ linux-2.6-mm/Documentation/lguest/x86_64/lguest_defs.h
@@ -0,0 +1,11 @@
+#ifndef _LGUEST_DEFS_H_
+#define _LGUEST_DEFS_H_
+
+#include <asm/vsyscall.h>
+
+#include "../../../include/asm/lguest_user.h"
+typedef Elf64_Ehdr Elf_Ehdr;
+typedef Elf64_Phdr Elf_Phdr;
+#define ELF_MACHINE EM_X86_64
+
+#endif
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization

Reply via email to