Diffstat (limited to 'recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.24/0013-hugepages-fix-use-after-free-bug-in-quota-handling.patch')
-rw-r--r-- recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.24/0013-hugepages-fix-use-after-free-bug-in-quota-handling.patch | 464
1 file changed, 464 insertions(+), 0 deletions(-)
diff --git a/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.24/0013-hugepages-fix-use-after-free-bug-in-quota-handling.patch b/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.24/0013-hugepages-fix-use-after-free-bug-in-quota-handling.patch
new file mode 100644
index 00000000..bdfa3864
--- /dev/null
+++ b/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.24/0013-hugepages-fix-use-after-free-bug-in-quota-handling.patch
@@ -0,0 +1,464 @@
From 5babdc7487f6c78c06d8e085efe841d91a77ff48 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 21 Mar 2012 16:34:12 -0700
Subject: [PATCH 013/109] hugepages: fix use after free bug in "quota"
 handling

commit 90481622d75715bfcb68501280a917dbfe516029 upstream.

hugetlbfs_{get,put}_quota() are badly named. They don't interact with the
general quota handling code, and they don't much resemble its behaviour.
Rather than being about maintaining limits on on-disk block usage by
particular users, they are instead about maintaining limits on in-memory
page usage (including anonymous MAP_PRIVATE copied-on-write pages)
associated with a particular hugetlbfs filesystem instance.

Worse, they work by having callbacks to the hugetlbfs filesystem code
from the low-level page handling code, in particular from
free_huge_page(). This is a layering violation in itself, but more
importantly, if the kernel does a get_user_pages() on hugepages (which
can happen from KVM amongst others), then free_huge_page() can be
delayed until after the associated inode has already been freed. If an
unmount occurs at the wrong time, even the hugetlbfs superblock where
the "quota" limits are stored may have been freed.

Andrew Barry proposed a patch to fix this by having hugepages store
a pointer directly to the superblock, bumping its reference count as
appropriate to avoid it being freed, instead of storing a pointer to
their address_space and reaching the superblock from there. Andrew
Morton rejected that version, however, on the grounds that it made the
existing layering violation worse.

This is a reworked version of Andrew's patch, which removes the extra,
and some of the existing, layering violation. It works by introducing
the concept of a hugepage "subpool" at the lower hugepage mm layer -
that is a finite logical pool of hugepages to allocate from. hugetlbfs
now creates a subpool for each filesystem instance with a page limit
set, and a pointer to the subpool gets added to each allocated
hugepage, instead of the address_space pointer used now. The subpool
has its own lifetime and is only freed once all pages in it _and_ all
other references to it (i.e. superblocks) are gone.

Subpools are optional - a NULL subpool pointer is taken by the code to
mean that no subpool limits are in effect.
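
To illustrate the lifetime rule, here is a standalone userspace model
of the accounting (illustrative only - the names, the pthread mutex and
plain malloc/free are stand-ins; the real kernel implementation is in
the mm/hugetlb.c hunks below):

    #include <assert.h>
    #include <pthread.h>
    #include <stdlib.h>

    /* Model of struct hugepage_subpool: it stays alive while the
     * filesystem still references it (count) or hugepages are still
     * charged to it (used_hpages). */
    struct subpool {
            pthread_mutex_t lock;
            long count, max_hpages, used_hpages;
    };

    /* Mirrors unlock_or_release_subpool(): whoever drops the last
     * reference or uncharges the last page frees the pool. */
    static void unlock_or_release(struct subpool *sp)
    {
            int free_it = sp->count == 0 && sp->used_hpages == 0;

            pthread_mutex_unlock(&sp->lock);
            if (free_it)
                    free(sp);
    }

    static int get_pages(struct subpool *sp, long delta)
    {
            int ret = 0;

            pthread_mutex_lock(&sp->lock);
            if (sp->used_hpages + delta <= sp->max_hpages)
                    sp->used_hpages += delta;
            else
                    ret = -1;       /* over the pool limit */
            pthread_mutex_unlock(&sp->lock);
            return ret;
    }

    static void put_pages(struct subpool *sp, long delta)
    {
            pthread_mutex_lock(&sp->lock);
            sp->used_hpages -= delta;
            unlock_or_release(sp);  /* may free sp */
    }

    int main(void)
    {
            struct subpool *sp = calloc(1, sizeof(*sp));

            pthread_mutex_init(&sp->lock, NULL);
            sp->count = 1;
            sp->max_hpages = 2;

            assert(get_pages(sp, 2) == 0);  /* within the limit */
            assert(get_pages(sp, 1) < 0);   /* third page refused */

            /* "unmount" while pages are still charged: pool survives */
            pthread_mutex_lock(&sp->lock);
            sp->count--;
            unlock_or_release(sp);

            put_pages(sp, 2);       /* last page gone - pool is freed */
            return 0;
    }

The point of the model: dropping the superblock reference while pages
are still charged (the delayed free_huge_page() case described above)
leaves the pool, and the limits stored in it, valid until the last
page is returned.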

Previous discussion of this bug can be found in: "Fix refcounting in
hugetlbfs quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or
http://marc.info/?l=linux-mm&m=126928970510627&w=1

v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to
alloc_huge_page() - since it already takes the vma, it is not necessary.

Signed-off-by: Andrew Barry <abarry@cray.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
[bwh: Backported to 3.2: adjust context to apply after commit
 c50ac050811d6485616a193eb0f37bfbd191cc89 'hugetlb: fix resv_map leak in
 error path', backported in 3.2.20]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 fs/hugetlbfs/inode.c    |   54 +++++++-----------
 include/linux/hugetlb.h |   14 ++++--
 mm/hugetlb.c            |  135 +++++++++++++++++++++++++++++++++++++---------
 3 files changed, 139 insertions(+), 64 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 2d0ca24..ebc2f4d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -592,9 +592,15 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	spin_lock(&sbinfo->stat_lock);
 	/* If no limits set, just report 0 for max/free/used
 	 * blocks, like simple_statfs() */
-	if (sbinfo->max_blocks >= 0) {
-		buf->f_blocks = sbinfo->max_blocks;
-		buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
+	if (sbinfo->spool) {
+		long free_pages;
+
+		spin_lock(&sbinfo->spool->lock);
+		buf->f_blocks = sbinfo->spool->max_hpages;
+		free_pages = sbinfo->spool->max_hpages
+			- sbinfo->spool->used_hpages;
+		buf->f_bavail = buf->f_bfree = free_pages;
+		spin_unlock(&sbinfo->spool->lock);
 		buf->f_files = sbinfo->max_inodes;
 		buf->f_ffree = sbinfo->free_inodes;
 	}
@@ -610,6 +616,10 @@ static void hugetlbfs_put_super(struct super_block *sb)
 
 	if (sbi) {
 		sb->s_fs_info = NULL;
+
+		if (sbi->spool)
+			hugepage_put_subpool(sbi->spool);
+
 		kfree(sbi);
 	}
 }
@@ -841,10 +851,14 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_fs_info = sbinfo;
 	sbinfo->hstate = config.hstate;
 	spin_lock_init(&sbinfo->stat_lock);
-	sbinfo->max_blocks = config.nr_blocks;
-	sbinfo->free_blocks = config.nr_blocks;
 	sbinfo->max_inodes = config.nr_inodes;
 	sbinfo->free_inodes = config.nr_inodes;
+	sbinfo->spool = NULL;
+	if (config.nr_blocks != -1) {
+		sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
+		if (!sbinfo->spool)
+			goto out_free;
+	}
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize = huge_page_size(config.hstate);
 	sb->s_blocksize_bits = huge_page_shift(config.hstate);
@@ -864,38 +878,12 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_root = root;
 	return 0;
 out_free:
+	if (sbinfo->spool)
+		kfree(sbinfo->spool);
 	kfree(sbinfo);
 	return -ENOMEM;
 }
 
-int hugetlb_get_quota(struct address_space *mapping, long delta)
-{
-	int ret = 0;
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
-
-	if (sbinfo->free_blocks > -1) {
-		spin_lock(&sbinfo->stat_lock);
-		if (sbinfo->free_blocks - delta >= 0)
-			sbinfo->free_blocks -= delta;
-		else
-			ret = -ENOMEM;
-		spin_unlock(&sbinfo->stat_lock);
-	}
-
-	return ret;
-}
-
-void hugetlb_put_quota(struct address_space *mapping, long delta)
-{
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
-
-	if (sbinfo->free_blocks > -1) {
-		spin_lock(&sbinfo->stat_lock);
-		sbinfo->free_blocks += delta;
-		spin_unlock(&sbinfo->stat_lock);
-	}
-}
-
 static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d9d6c86..c5ed2f1 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -14,6 +14,15 @@ struct user_struct;
 #include <linux/shm.h>
 #include <asm/tlbflush.h>
 
+struct hugepage_subpool {
+	spinlock_t lock;
+	long count;
+	long max_hpages, used_hpages;
+};
+
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
+void hugepage_put_subpool(struct hugepage_subpool *spool);
+
 int PageHuge(struct page *page);
 
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
@@ -138,12 +147,11 @@ struct hugetlbfs_config {
 };
 
 struct hugetlbfs_sb_info {
-	long	max_blocks;   /* blocks allowed */
-	long	free_blocks;  /* blocks free */
 	long	max_inodes;   /* inodes allowed */
 	long	free_inodes;  /* inodes free */
 	spinlock_t	stat_lock;
 	struct hstate *hstate;
+	struct hugepage_subpool *spool;
 };
 
 
@@ -166,8 +174,6 @@ extern const struct file_operations hugetlbfs_file_operations;
 extern const struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
 				struct user_struct **user, int creat_flags);
-int hugetlb_get_quota(struct address_space *mapping, long delta);
-void hugetlb_put_quota(struct address_space *mapping, long delta);
 
 static inline int is_file_hugepages(struct file *file)
 {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5f5c545..7c535b0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size;
  */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+{
+	bool free = (spool->count == 0) && (spool->used_hpages == 0);
+
+	spin_unlock(&spool->lock);
+
+	/* If no pages are used, and no other handles to the subpool
+	 * remain, free the subpool */
+	if (free)
+		kfree(spool);
+}
+
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+{
+	struct hugepage_subpool *spool;
+
+	spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+	if (!spool)
+		return NULL;
+
+	spin_lock_init(&spool->lock);
+	spool->count = 1;
+	spool->max_hpages = nr_blocks;
+	spool->used_hpages = 0;
+
+	return spool;
+}
+
+void hugepage_put_subpool(struct hugepage_subpool *spool)
+{
+	spin_lock(&spool->lock);
+	BUG_ON(!spool->count);
+	spool->count--;
+	unlock_or_release_subpool(spool);
+}
+
+static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+				      long delta)
+{
+	int ret = 0;
+
+	if (!spool)
+		return 0;
+
+	spin_lock(&spool->lock);
+	if ((spool->used_hpages + delta) <= spool->max_hpages) {
+		spool->used_hpages += delta;
+	} else {
+		ret = -ENOMEM;
+	}
+	spin_unlock(&spool->lock);
+
+	return ret;
+}
+
+static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+				       long delta)
+{
+	if (!spool)
+		return;
+
+	spin_lock(&spool->lock);
+	spool->used_hpages -= delta;
+	/* If hugetlbfs_put_super couldn't free spool due to
+	 * an outstanding quota reference, free it now. */
+	unlock_or_release_subpool(spool);
+}
+
+static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
+{
+	return HUGETLBFS_SB(inode->i_sb)->spool;
+}
+
+static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
+{
+	return subpool_inode(vma->vm_file->f_dentry->d_inode);
+}
+
 /*
  * Region tracking -- allows tracking of reservations and instantiated pages
  *                    across the pages in a mapping.
@@ -533,9 +611,9 @@ static void free_huge_page(struct page *page)
 	 */
 	struct hstate *h = page_hstate(page);
 	int nid = page_to_nid(page);
-	struct address_space *mapping;
+	struct hugepage_subpool *spool =
+		(struct hugepage_subpool *)page_private(page);
 
-	mapping = (struct address_space *) page_private(page);
 	set_page_private(page, 0);
 	page->mapping = NULL;
 	BUG_ON(page_count(page));
@@ -551,8 +629,7 @@ static void free_huge_page(struct page *page)
 		enqueue_huge_page(h, page);
 	}
 	spin_unlock(&hugetlb_lock);
-	if (mapping)
-		hugetlb_put_quota(mapping, 1);
+	hugepage_subpool_put_pages(spool, 1);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -966,11 +1043,12 @@ static void return_unused_surplus_pages(struct hstate *h,
 /*
  * Determine if the huge page at addr within the vma has an associated
  * reservation.  Where it does not we will need to logically increase
- * reservation and actually increase quota before an allocation can occur.
- * Where any new reservation would be required the reservation change is
- * prepared, but not committed.  Once the page has been quota'd allocated
- * an instantiated the change should be committed via vma_commit_reservation.
- * No action is required on failure.
+ * reservation and actually increase subpool usage before an allocation
+ * can occur. Where any new reservation would be required the
+ * reservation change is prepared, but not committed. Once the page
+ * has been allocated from the subpool and instantiated the change should
+ * be committed via vma_commit_reservation. No action is required on
+ * failure.
  */
 static long vma_needs_reservation(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long addr)
@@ -1019,24 +1097,24 @@ static void vma_commit_reservation(struct hstate *h,
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
 				    unsigned long addr, int avoid_reserve)
 {
+	struct hugepage_subpool *spool = subpool_vma(vma);
 	struct hstate *h = hstate_vma(vma);
 	struct page *page;
-	struct address_space *mapping = vma->vm_file->f_mapping;
-	struct inode *inode = mapping->host;
 	long chg;
 
 	/*
-	 * Processes that did not create the mapping will have no reserves and
-	 * will not have accounted against quota. Check that the quota can be
-	 * made before satisfying the allocation
-	 * MAP_NORESERVE mappings may also need pages and quota allocated
-	 * if no reserve mapping overlaps.
+	 * Processes that did not create the mapping will have no
+	 * reserves and will not have accounted against subpool
+	 * limit. Check that the subpool limit can be made before
+	 * satisfying the allocation. MAP_NORESERVE mappings may also
+	 * need pages and subpool limit allocated if no reserve
+	 * mapping overlaps.
 	 */
 	chg = vma_needs_reservation(h, vma, addr);
 	if (chg < 0)
 		return ERR_PTR(-VM_FAULT_OOM);
 	if (chg)
-		if (hugetlb_get_quota(inode->i_mapping, chg))
+		if (hugepage_subpool_get_pages(spool, chg))
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 
 	spin_lock(&hugetlb_lock);
@@ -1046,12 +1124,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	if (!page) {
 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
-			hugetlb_put_quota(inode->i_mapping, chg);
+			hugepage_subpool_put_pages(spool, chg);
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 		}
 	}
 
-	set_page_private(page, (unsigned long) mapping);
+	set_page_private(page, (unsigned long)spool);
 
 	vma_commit_reservation(h, vma, addr);
 
@@ -2081,6 +2159,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 {
 	struct hstate *h = hstate_vma(vma);
 	struct resv_map *reservations = vma_resv_map(vma);
+	struct hugepage_subpool *spool = subpool_vma(vma);
 	unsigned long reserve;
 	unsigned long start;
 	unsigned long end;
@@ -2096,7 +2175,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 
 		if (reserve) {
 			hugetlb_acct_memory(h, -reserve);
-			hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
+			hugepage_subpool_put_pages(spool, reserve);
 		}
 	}
 }
@@ -2326,7 +2405,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	address = address & huge_page_mask(h);
 	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
 		+ (vma->vm_pgoff >> PAGE_SHIFT);
-	mapping = (struct address_space *)page_private(page);
+	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
 
 	/*
 	 * Take the mapping lock for the duration of the table walk. As
@@ -2865,11 +2944,12 @@ int hugetlb_reserve_pages(struct inode *inode,
 {
 	long ret, chg;
 	struct hstate *h = hstate_inode(inode);
+	struct hugepage_subpool *spool = subpool_inode(inode);
 
 	/*
 	 * Only apply hugepage reservation if asked. At fault time, an
 	 * attempt will be made for VM_NORESERVE to allocate a page
-	 * and filesystem quota without using reserves
+	 * without using reserves
 	 */
 	if (vm_flags & VM_NORESERVE)
 		return 0;
@@ -2898,19 +2978,19 @@ int hugetlb_reserve_pages(struct inode *inode,
 		goto out_err;
 	}
 
-	/* There must be enough filesystem quota for the mapping */
-	if (hugetlb_get_quota(inode->i_mapping, chg)) {
+	/* There must be enough pages in the subpool for the mapping */
+	if (hugepage_subpool_get_pages(spool, chg)) {
 		ret = -ENOSPC;
 		goto out_err;
 	}
 
 	/*
 	 * Check enough hugepages are available for the reservation.
-	 * Hand back the quota if there are not
+	 * Hand the pages back to the subpool if there are not
 	 */
 	ret = hugetlb_acct_memory(h, chg);
 	if (ret < 0) {
-		hugetlb_put_quota(inode->i_mapping, chg);
+		hugepage_subpool_put_pages(spool, chg);
 		goto out_err;
 	}
 
@@ -2938,12 +3018,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 {
 	struct hstate *h = hstate_inode(inode);
 	long chg = region_truncate(&inode->i_mapping->private_list, offset);
+	struct hugepage_subpool *spool = subpool_inode(inode);
 
 	spin_lock(&inode->i_lock);
 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
 	spin_unlock(&inode->i_lock);
 
-	hugetlb_put_quota(inode->i_mapping, (chg - freed));
+	hugepage_subpool_put_pages(spool, (chg - freed));
 	hugetlb_acct_memory(h, -(chg - freed));
 }
 
-- 
1.7.7.6
