Resend: patch: qemu + hugetlbfs..

john cooper Mon, 25 Aug 2008 16:07:58 -0700

This patch from over a month ago doesn't seem to have
made it into kvm-73 and may have been lost in the
shuffle.  Attached is essentially the same patch but
as applied to kvm-73, and validated relative to that
version.


In a nutshell the intention here is to allow
preallocation of guest huge page backed memory at
qemu initialization time to avoid a quirk in the
kernel's huge page accounting allowing overcommit
of huge pages.  Failure of the kernel to resolve a
guest fault to overcommitted huge page memory during
runtime results in sigkill termination of the guest.
This patch provides the option of avoiding such
behavior at the cost of up-front preallocation of
physical huge pages backing the guest.

-john


Anthony Liguori wrote:

john cooper wrote:

Anthony Liguori wrote:

john cooper wrote:

As it currently exists alloc_hpage_mem() is tied to
the notion of huge page allocation as it will reference
gethugepagesize() irrespective of *mem_path.  So even
in the case of tmpfs backed files, if the host kernel
has been configured with CONFIG_HUGETLBFS we will wind
up doing allocations of /dev/shm mapped files at
/proc/meminfo:Hugepagesize granularity.


Which is fine.  It just means we round -m values up to even numbers.


Well, yes it will round the allocation.  But from a
minimally sufficient 4KB boundary to that of 4MB/2MB
relative to a 32/64 bit x86 host which is excessive.

Probably not what was intended but probably not too
much of a concern as "-mem-path /dev/shm" is likely
only used in debug of this flag and associated logic.
I don't see it currently being worth the trouble to
correct from a squeaky clean POV, and doing so may
drag in far more than the header file we've just
booted above to deal with this architecture/config
dependency.
Renaming a function to a name that's less accurate seems bad to me.I don't mean to be pedantic, but it seems like a strange thing todo. I prefer it the way it was before.


I don't see any harm reverting the name.  But I do
believe it is largely cosmetic as given the above,
the current code does require some work to make it
independent of huge page assumptions.  Update attached.

-john


Looks good to me.

Acked-by: Anthony Liguori <[EMAIL PROTECTED]>

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html



--
[EMAIL PROTECTED]

 vl.c |   48 ++++++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 18 deletions(-)
=================================================================
--- ./qemu/vl.c.PAORG
+++ ./qemu/vl.c
@@ -239,6 +239,7 @@ int autostart = 1;
 int time_drift_fix = 0;
 unsigned int kvm_shadow_memory = 0;
 const char *mem_path = NULL;
+int mem_prealloc = 1;  /* force preallocation of physical target memory */
 int hpagesize = 0;
 const char *cpu_vendor_string;
 #ifdef TARGET_ARM
@@ -8423,7 +8424,10 @@ static void help(int exitcode)
 #endif
            "-tdf            inject timer interrupts that got lost\n"
            "-kvm-shadow-memory megs set the amount of shadow pages to be 
allocated\n"
-           "-mem-path       set the path to hugetlbfs/tmpfs mounted directory, 
also enables allocation of guest memory with huge pages\n"
+           "-mem-path       set the path to hugetlbfs/tmpfs mounted directory, 
also\n"
+           "                enables allocation of guest memory with huge 
pages\n"
+           "-mem-prealloc   toggles preallocation of -mem-path backed physical 
memory\n"
+           "                at startup.  Default is enabled.\n"
           "-option-rom rom load a file, rom, into the option ROM space\n"
 #ifdef TARGET_SPARC
            "-prom-env variable=value  set OpenBIOS nvram variables\n"
@@ -8546,6 +8550,7 @@ enum {
     QEMU_OPTION_tdf,
     QEMU_OPTION_kvm_shadow_memory,
     QEMU_OPTION_mempath,
+    QEMU_OPTION_mem_prealloc
 };
 
 typedef struct QEMUOption {
@@ -8671,6 +8676,7 @@ const QEMUOption qemu_options[] = {
     { "tb-size", HAS_ARG, QEMU_OPTION_tb_size },
     { "icount", HAS_ARG, QEMU_OPTION_icount },
     { "mem-path", HAS_ARG, QEMU_OPTION_mempath },
+    { "mem-prealloc", 0, QEMU_OPTION_mem_prealloc },
     { NULL },
 };
 
@@ -8890,11 +8896,13 @@ static int gethugepagesize(void)
     return hugepagesize;
 }
 
+/* attempt to allocate memory mmap'ed to mem_path
+ */
 void *alloc_mem_area(unsigned long memory, const char *path)
 {
     char *filename;
     void *area;
-    int fd;
+    int fd, flags;
 
     if (asprintf(&filename, "%s/kvm.XXXXXX", path) == -1)
        return NULL;
@@ -8922,26 +8930,27 @@ void *alloc_mem_area(unsigned long memor
      */
     ftruncate(fd, memory);
 
-    area = mmap(0, memory, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
-    if (area == MAP_FAILED) {
-       perror("mmap");
-       close(fd);
-       return NULL;
-    }
-
-    return area;
+    /* NB: MAP_POPULATE won't exhaustively alloc all phys pages in the case
+     * MAP_PRIVATE is requested.  For mem_prealloc we mmap as MAP_SHARED
+     * to sidestep this quirk.
+     */
+    flags = mem_prealloc ? MAP_POPULATE|MAP_SHARED : MAP_PRIVATE;
+    area = mmap(0, memory, PROT_READ|PROT_WRITE, flags, fd, 0);
+    if (area != MAP_FAILED)
+       return (area);
+    perror("alloc_mem_area: can't mmap hugetlbfs pages");
+    close(fd);
+    return (NULL);
 }
 
-void *qemu_alloc_physram(unsigned long memory)
+/* allocate guest memory as requested
+ */
+void *qemu_alloc_physram(unsigned long size)
 {
-    void *area = NULL;
-
     if (mem_path)
-       area = alloc_mem_area(memory, mem_path);
-    if (!area)
-       area = qemu_vmalloc(memory);
-
-    return area;
+       return (alloc_mem_area(size, mem_path));
+    else
+       return (qemu_vmalloc(size));
 }
 
 int main(int argc, char **argv)
@@ -9546,6 +9555,9 @@ int main(int argc, char **argv)
             case QEMU_OPTION_mempath:
                mem_path = optarg;
                break;
+            case QEMU_OPTION_mem_prealloc:
+               mem_prealloc = !mem_prealloc;
+               break;
             case QEMU_OPTION_name:
                 qemu_name = optarg;
                 break;

Resend: patch: qemu + hugetlbfs..

Reply via email to