Proof of concept (POC) of kernel-supported huge page backed stacks.  This
patch only enables the feature in the kernel; to set up a huge page backed
stack you also need a user-space utility that sets the HUGE_PAGE_STACK
personality flag for a process.  This patch does not enable full huge page
backed stack functionality: it will fail if a process stack exceeds one huge
page, because allocation and alignment across multiple huge pages have not
been addressed.  Stack randomization is not currently supported with huge
page stacks, so when running with HUGE_PAGE_STACK, set
/proc/sys/kernel/randomize_va_space to 0.

Sample user-space utility to follow.

Based on 2.6.25-rc9

Signed-off-by: Eric Munson <[EMAIL PROTECTED]>

---

 fs/binfmt_elf.c               |   17 +++++++++--
 fs/exec.c                     |   63 ++++++++++++++++++++++++++++++++++++++----
 fs/hugetlbfs/inode.c          |   22 ++++++++------
 include/asm-ia64/page.h       |    1
 include/asm-powerpc/page_64.h |    1
 include/asm-sparc64/page.h    |    3 ++
 include/asm-x86/page.h        |    3 ++
 include/linux/hugetlb.h       |    4 +-
 include/linux/personality.h   |    3 ++
 ipc/shm.c                     |    2 -
 10 files changed, 97 insertions(+), 22 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 5e1a4fb..4a0b3ef 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -518,12 +518,23 @@ static unsigned long randomize_stack_top(unsigned long 
stack_top)
        if ((current->flags & PF_RANDOMIZE) &&
                !(current->personality & ADDR_NO_RANDOMIZE)) {
                random_variable = get_random_int() & STACK_RND_MASK;
-               random_variable <<= PAGE_SHIFT;
+               
+               if (get_personality & HUGE_PAGE_STACK) {
+                       random_variable >>= (HPAGE_SHIFT - PAGE_SHIFT);
+                       random_variable <<= HPAGE_SHIFT;
+               } else
+                       random_variable <<= PAGE_SHIFT;
        }
 #ifdef CONFIG_STACK_GROWSUP
-       return PAGE_ALIGN(stack_top) + random_variable;
+       if (get_personality & HUGE_PAGE_STACK)
+               return HPAGE_ALIGN(stack_top) + random_variable;
+       else
+               return PAGE_ALIGN(stack_top) + random_variable;
 #else
-       return PAGE_ALIGN(stack_top) - random_variable;
+       if (get_personality & HUGE_PAGE_STACK)
+               return HPAGE_ALIGN(stack_top) - random_variable;
+       else
+               return PAGE_ALIGN(stack_top) - random_variable;
 #endif
 }
 
diff --git a/fs/exec.c b/fs/exec.c
index 54a0a55..aab2a27 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -51,6 +51,7 @@
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/audit.h>
+#include <linux/hugetlb.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -224,11 +225,31 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
        int err = -ENOMEM;
        struct vm_area_struct *vma = NULL;
        struct mm_struct *mm = bprm->mm;
+       struct file *hugefile;
+       char fname[64] = {0};
+       char hpagestack = !((get_personality & HUGE_PAGE_STACK) == 0 );
 
        bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
        if (!vma)
                goto err;
 
+       if (hpagestack) {
+               sprintf(fname, "htlbfs%lu", current->pid);
+               hugefile = hugetlb_file_setup(fname, HPAGE_SIZE, 0);
+               if (unlikely(IS_ERR_VALUE(hugefile))) {
+                       printk(KERN_DEBUG "Huge page backed stack unavailable 
for process %lu.\n", current->pid);
+                       /*
+                        * If huge pages are not available for this stack,
+                        * fall back to normal pages for execution instead of
+                        * failing.
+                        */
+                       set_personality(get_personality & (~HUGE_PAGE_STACK));
+                       hpagestack = 0;
+               } else
+                       vma->vm_file = hugefile;
+       }
+
+
        down_write(&mm->mmap_sem);
        vma->vm_mm = mm;
 
@@ -239,9 +260,17 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
         * configured yet.
         */
        vma->vm_end = STACK_TOP_MAX;
-       vma->vm_start = vma->vm_end - PAGE_SIZE;
+       if (hpagestack)
+               vma->vm_start = vma->vm_end - HPAGE_SIZE;
+       else
+               vma->vm_start = vma->vm_end - PAGE_SIZE;
 
        vma->vm_flags = VM_STACK_FLAGS;
+       if (hpagestack) {
+               vma->vm_flags |= VM_HUGETLB;
+               /* Stack randomization is currently not supported on huge pages 
*/
+               vma->vm_flags |= ADDR_NO_RANDOMIZE;
+       }
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
        err = insert_vm_struct(mm, vma);
        if (err) {
@@ -575,6 +604,8 @@ int setup_arg_pages(struct linux_binprm *bprm,
        struct vm_area_struct *prev = NULL;
        unsigned long vm_flags;
        unsigned long stack_base;
+       unsigned long stack_pad;
+       char hpagestack = !((get_personality & HUGE_PAGE_STACK) == 0);
 
 #ifdef CONFIG_STACK_GROWSUP
        /* Limit stack size to 1GB */
@@ -586,16 +617,25 @@ int setup_arg_pages(struct linux_binprm *bprm,
        if (vma->vm_end - vma->vm_start > stack_base)
                return -ENOMEM;
 
-       stack_base = PAGE_ALIGN(stack_top - stack_base);
+       if (hpagestack)
+               stack_base = HPAGE_ALIGN(stack_top - stack_base);
+       else
+               stack_base = PAGE_ALIGN(stack_top - stack_base);
 
        stack_shift = vma->vm_start - stack_base;
+       if (hpagestack)
+               stack_shift = ALIGN(stack_shift, HPAGE_SIZE);
        mm->arg_start = bprm->p - stack_shift;
        bprm->p = vma->vm_end - stack_shift;
 #else
        stack_top = arch_align_stack(stack_top);
-       stack_top = PAGE_ALIGN(stack_top);
+       if (hpagestack)
+               stack_top = HPAGE_ALIGN(stack_top);
+       else
+               stack_top = PAGE_ALIGN(stack_top);
        stack_shift = vma->vm_end - stack_top;
-
+       if (hpagestack)
+               stack_shift = ALIGN(stack_shift, HPAGE_SIZE);
        bprm->p -= stack_shift;
        mm->arg_start = bprm->p;
 #endif
@@ -633,11 +673,22 @@ int setup_arg_pages(struct linux_binprm *bprm,
                }
        }
 
+       stack_pad = EXTRA_STACK_VM_PAGES * PAGE_SIZE;
+       if (hpagestack)
+               stack_pad = ALIGN(stack_pad, HPAGE_SIZE);
+
 #ifdef CONFIG_STACK_GROWSUP
-       stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE;
+       if (vma->vm_end - vma->vm_start < stack_pad)
+               stack_base = vma->vm_end + stack_pad;
+       else
+               stack_base = vma->vm_end;
 #else
-       stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE;
+       if (vma->vm_end - vma->vm_start < stack_pad)
+               stack_base = vma->vm_start - stack_pad;
+       else
+               stack_base = vma->vm_start;
 #endif
+
        ret = expand_stack(vma, stack_base);
        if (ret)
                ret = -EFAULT;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6846785..0ca7be2 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -488,7 +488,7 @@ out:
 }
 
 static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, 
-                                       gid_t gid, int mode, dev_t dev)
+                                       gid_t gid, int mode, dev_t dev, char 
shared)
 {
        struct inode *inode;
 
@@ -504,7 +504,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block 
*sb, uid_t uid,
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                INIT_LIST_HEAD(&inode->i_mapping->private_list);
                info = HUGETLBFS_I(inode);
-               mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL);
+               if (shared)
+                       mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 
NULL);
                switch (mode & S_IFMT) {
                default:
                        init_special_inode(inode, mode, dev);
@@ -545,7 +546,7 @@ static int hugetlbfs_mknod(struct inode *dir,
        } else {
                gid = current->fsgid;
        }
-       inode = hugetlbfs_get_inode(dir->i_sb, current->fsuid, gid, mode, dev);
+       inode = hugetlbfs_get_inode(dir->i_sb, current->fsuid, gid, mode, dev, 
1);
        if (inode) {
                dir->i_ctime = dir->i_mtime = CURRENT_TIME;
                d_instantiate(dentry, inode);
@@ -581,7 +582,7 @@ static int hugetlbfs_symlink(struct inode *dir,
                gid = current->fsgid;
 
        inode = hugetlbfs_get_inode(dir->i_sb, current->fsuid,
-                                       gid, S_IFLNK|S_IRWXUGO, 0);
+                                       gid, S_IFLNK|S_IRWXUGO, 0, 1);
        if (inode) {
                int l = strlen(symname)+1;
                error = page_symlink(inode, symname, l);
@@ -845,7 +846,7 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, 
int silent)
        sb->s_op = &hugetlbfs_ops;
        sb->s_time_gran = 1;
        inode = hugetlbfs_get_inode(sb, config.uid, config.gid,
-                                       S_IFDIR | config.mode, 0);
+                                       S_IFDIR | config.mode, 0, 1);
        if (!inode)
                goto out_free;
 
@@ -910,7 +911,7 @@ static int can_do_hugetlb_shm(void)
                        can_do_mlock());
 }
 
-struct file *hugetlb_file_setup(const char *name, size_t size)
+struct file *hugetlb_file_setup(const char *name, size_t size, char shared)
 {
        int error = -ENOMEM;
        struct file *file;
@@ -921,10 +922,10 @@ struct file *hugetlb_file_setup(const char *name, size_t 
size)
        if (!hugetlbfs_vfsmount)
                return ERR_PTR(-ENOENT);
 
-       if (!can_do_hugetlb_shm())
+       if (shared && !can_do_hugetlb_shm())
                return ERR_PTR(-EPERM);
 
-       if (!user_shm_lock(size, current->user))
+       if (shared && !user_shm_lock(size, current->user))
                return ERR_PTR(-ENOMEM);
 
        root = hugetlbfs_vfsmount->mnt_root;
@@ -937,7 +938,7 @@ struct file *hugetlb_file_setup(const char *name, size_t 
size)
 
        error = -ENOSPC;
        inode = hugetlbfs_get_inode(root->d_sb, current->fsuid,
-                               current->fsgid, S_IFREG | S_IRWXUGO, 0);
+                               current->fsgid, S_IFREG | S_IRWXUGO, 0, shared);
        if (!inode)
                goto out_dentry;
 
@@ -963,7 +964,8 @@ out_inode:
 out_dentry:
        dput(dentry);
 out_shm_unlock:
-       user_shm_unlock(size, current->user);
+       if (shared)
+               user_shm_unlock(size, current->user);
        return ERR_PTR(error);
 }
 
diff --git a/include/asm-ia64/page.h b/include/asm-ia64/page.h
index 4999a6c..5ae70a2 100644
--- a/include/asm-ia64/page.h
+++ b/include/asm-ia64/page.h
@@ -52,6 +52,7 @@
 # define HPAGE_SHIFT_DEFAULT   28      /* check ia64 SDM for architecture 
supported size */
 # define HPAGE_SIZE            (__IA64_UL_CONST(1) << HPAGE_SHIFT)
 # define HPAGE_MASK            (~(HPAGE_SIZE - 1))
+# define HPAGE_ALIGN(addr)     (((addr)+HPAGE_SIZE-1)&HPAGE_MASK)
 
 # define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 # define ARCH_HAS_HUGEPAGE_ONLY_RANGE
diff --git a/include/asm-powerpc/page_64.h b/include/asm-powerpc/page_64.h
index 67834ea..40a0d0d 100644
--- a/include/asm-powerpc/page_64.h
+++ b/include/asm-powerpc/page_64.h
@@ -90,6 +90,7 @@ extern unsigned int HPAGE_SHIFT;
 #define HPAGE_SIZE             ((1UL) << HPAGE_SHIFT)
 #define HPAGE_MASK             (~(HPAGE_SIZE - 1))
 #define HUGETLB_PAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)
+#define HPAGE_ALIGN(addr)      _ALIGN(addr, HPAGE_SIZE)
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/include/asm-sparc64/page.h b/include/asm-sparc64/page.h
index e93a482..90174d5 100644
--- a/include/asm-sparc64/page.h
+++ b/include/asm-sparc64/page.h
@@ -117,6 +117,9 @@ typedef struct page *pgtable_t;
 /* to align the pointer to the (next) page boundary */
 #define PAGE_ALIGN(addr)       (((addr)+PAGE_SIZE-1)&PAGE_MASK)
 
+/* to align the pointer to the (next) huge page boundary */
+#define HPAGE_ALIGN(addr)      (((addr)+HPAGE_SIZE-1)&HPAGE_MASK)
+
 /* We used to stick this into a hard-coded global register (%g4)
  * but that does not make sense anymore.
  */
diff --git a/include/asm-x86/page.h b/include/asm-x86/page.h
index a05b289..62d02f1 100644
--- a/include/asm-x86/page.h
+++ b/include/asm-x86/page.h
@@ -24,6 +24,9 @@
 /* to align the pointer to the (next) page boundary */
 #define PAGE_ALIGN(addr)       (((addr)+PAGE_SIZE-1)&PAGE_MASK)
 
+/* to align the pointer to the (next) huge page boundary */
+#define HPAGE_ALIGN(addr)      (((addr)+HPAGE_SIZE-1)&HPAGE_MASK)
+
 #define __PHYSICAL_MASK                _AT(phys_addr_t, (_AC(1,ULL) << 
__PHYSICAL_MASK_SHIFT) - 1)
 #define __VIRTUAL_MASK         ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
 
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index addca4c..442cf42 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -165,7 +165,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct 
super_block *sb)
 
 extern const struct file_operations hugetlbfs_file_operations;
 extern struct vm_operations_struct hugetlb_vm_ops;
-struct file *hugetlb_file_setup(const char *name, size_t);
+struct file *hugetlb_file_setup(const char *name, size_t, char shared);
 int hugetlb_get_quota(struct address_space *mapping, long delta);
 void hugetlb_put_quota(struct address_space *mapping, long delta);
 
@@ -189,7 +189,7 @@ static inline void set_file_hugepages(struct file *file)
 
 #define is_file_hugepages(file)                0
 #define set_file_hugepages(file)       BUG()
-#define hugetlb_file_setup(name,size)  ERR_PTR(-ENOSYS)
+#define hugetlb_file_setup(name,size,shared)   ERR_PTR(-ENOSYS)
 
 #endif /* !CONFIG_HUGETLBFS */
 
diff --git a/include/linux/personality.h b/include/linux/personality.h
index 012cd55..2ba404c 100644
--- a/include/linux/personality.h
+++ b/include/linux/personality.h
@@ -22,6 +22,9 @@ extern int            __set_personality(unsigned long);
  * These occupy the top three bytes.
  */
 enum {
+       HUGE_PAGE_STACK =       0x0020000,      /* Attempt to use a huge page 
for the process
+                                                * stack
+                                                */
        ADDR_NO_RANDOMIZE =     0x0040000,      /* disable randomization of VA 
space */
        FDPIC_FUNCPTRS =        0x0080000,      /* userspace function ptrs 
point to descriptors
                                                 * (signal handling)
diff --git a/ipc/shm.c b/ipc/shm.c
index cc63fae..41f3201 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -401,7 +401,7 @@ static int newseg(struct ipc_namespace *ns, struct 
ipc_params *params)
        sprintf (name, "SYSV%08x", key);
        if (shmflg & SHM_HUGETLB) {
                /* hugetlb_file_setup takes care of mlock user accounting */
-               file = hugetlb_file_setup(name, size);
+               file = hugetlb_file_setup(name, size, 1);
                shp->mlock_user = current->user;
        } else {
                int acctflag = VM_ACCOUNT;

Attachment: signature.asc
Description: This is a digitally signed message part

-------------------------------------------------------------------------
This SF.net email is sponsored by the 2008 JavaOne(SM) Conference 
Don't miss this year's exciting event. There's still time to save $100. 
Use priority code J8TL2D2. 
http://ad.doubleclick.net/clk;198757673;13503038;p?http://java.sun.com/javaone
_______________________________________________
Libhugetlbfs-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/libhugetlbfs-devel

Reply via email to