From: Andy Whitcroft <apw@shadowen.org>

Two problems:

a) The memory overcommit code fails to take into account all the pages
   which are pinned by being reserved for the hugetlbpage pool.

b) We're performing overcommit accounting and checking on behalf of
   hugetlbpage vmas.

The main thrust is to ensure that VM_ACCOUNT actually only gets set on
vma's which are indeed accountable.  With that ensured much of the rest
comes out in the wash.  It also excludes the hugetlb pool from the memory
total used in the overcommit_memory=2 case.


---

 25-akpm/arch/i386/mm/hugetlbpage.c    |    6 ++++++
 25-akpm/arch/ia64/mm/hugetlbpage.c    |    6 ++++++
 25-akpm/arch/ppc64/mm/hugetlbpage.c   |    6 ++++++
 25-akpm/arch/sh/mm/hugetlbpage.c      |    6 ++++++
 25-akpm/arch/sparc64/mm/hugetlbpage.c |    6 ++++++
 25-akpm/include/linux/hugetlb.h       |    5 +++++
 25-akpm/include/linux/mm.h            |    3 +++
 25-akpm/mm/mmap.c                     |    7 ++++++-
 25-akpm/mm/mprotect.c                 |    3 ++-
 25-akpm/security/commoncap.c          |    4 +++-
 25-akpm/security/dummy.c              |    4 +++-
 25-akpm/security/selinux/hooks.c      |    4 +++-
 12 files changed, 55 insertions(+), 5 deletions(-)

diff -puN arch/i386/mm/hugetlbpage.c~hugetlb-overcommit-fix arch/i386/mm/hugetlbpage.c
--- 25/arch/i386/mm/hugetlbpage.c~hugetlb-overcommit-fix	2004-03-26 02:03:30.161352024 -0800
+++ 25-akpm/arch/i386/mm/hugetlbpage.c	2004-03-26 02:03:30.177349592 -0800
@@ -548,6 +548,12 @@ int is_hugepage_mem_enough(size_t size)
 	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpagemem;
 }
 
+/* Return the memory held in the hugetlb pool, counted in PAGE_SIZE units. */
+unsigned long hugetlb_total_pages(void)
+{
+	return htlbzone_pages * (HPAGE_SIZE / PAGE_SIZE);
+}
+
 /*
  * We cannot handle pagefaults against hugetlb pages at all.  They cause
  * handle_mm_fault() to try to instantiate regular-sized pages in the
diff -puN arch/ia64/mm/hugetlbpage.c~hugetlb-overcommit-fix arch/ia64/mm/hugetlbpage.c
--- 25/arch/ia64/mm/hugetlbpage.c~hugetlb-overcommit-fix	2004-03-26 02:03:30.163351720 -0800
+++ 25-akpm/arch/ia64/mm/hugetlbpage.c	2004-03-26 02:03:30.178349440 -0800
@@ -592,6 +592,12 @@ int is_hugepage_mem_enough(size_t size)
 	return 1;
 }
 
+/* Return the memory held in the hugetlb pool, counted in PAGE_SIZE units. */
+unsigned long hugetlb_total_pages(void)
+{
+	return htlbzone_pages * (HPAGE_SIZE / PAGE_SIZE);
+}
+
 static struct page *hugetlb_nopage(struct vm_area_struct * area, unsigned long address, int *unused)
 {
 	BUG();
diff -puN arch/ppc64/mm/hugetlbpage.c~hugetlb-overcommit-fix arch/ppc64/mm/hugetlbpage.c
--- 25/arch/ppc64/mm/hugetlbpage.c~hugetlb-overcommit-fix	2004-03-26 02:03:30.164351568 -0800
+++ 25-akpm/arch/ppc64/mm/hugetlbpage.c	2004-03-26 02:03:30.179349288 -0800
@@ -912,6 +912,12 @@ int is_hugepage_mem_enough(size_t size)
 	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpage_free;
 }
 
+/* Return the memory held in the hugetlb pool, counted in PAGE_SIZE units. */
+unsigned long hugetlb_total_pages(void)
+{
+	return htlbpage_total * (HPAGE_SIZE / PAGE_SIZE);
+}
+
 /*
  * We cannot handle pagefaults against hugetlb pages at all.  They cause
  * handle_mm_fault() to try to instantiate regular-sized pages in the
diff -puN arch/sparc64/mm/hugetlbpage.c~hugetlb-overcommit-fix arch/sparc64/mm/hugetlbpage.c
--- 25/arch/sparc64/mm/hugetlbpage.c~hugetlb-overcommit-fix	2004-03-26 02:03:30.166351264 -0800
+++ 25-akpm/arch/sparc64/mm/hugetlbpage.c	2004-03-26 02:03:30.180349136 -0800
@@ -497,6 +497,12 @@ int is_hugepage_mem_enough(size_t size)
 	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpagemem;
 }
 
+/* Return the memory held in the hugetlb pool, counted in PAGE_SIZE units. */
+unsigned long hugetlb_total_pages(void)
+{
+	return htlbzone_pages * (HPAGE_SIZE / PAGE_SIZE);
+}
+
 /*
  * We cannot handle pagefaults against hugetlb pages at all.  They cause
  * handle_mm_fault() to try to instantiate regular-sized pages in the
diff -puN include/linux/hugetlb.h~hugetlb-overcommit-fix include/linux/hugetlb.h
--- 25/include/linux/hugetlb.h~hugetlb-overcommit-fix	2004-03-26 02:03:30.167351112 -0800
+++ 25-akpm/include/linux/hugetlb.h	2004-03-26 02:03:30.180349136 -0800
@@ -19,6 +19,7 @@ int hugetlb_prefault(struct address_spac
 void huge_page_release(struct page *);
 int hugetlb_report_meminfo(char *);
 int is_hugepage_mem_enough(size_t);
+unsigned long hugetlb_total_pages(void);
 struct page *follow_huge_addr(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write);
 struct vm_area_struct *hugepage_vma(struct mm_struct *mm,
@@ -48,6 +49,10 @@ static inline int is_vm_hugetlb_page(str
 {
 	return 0;
 }
+static inline unsigned long hugetlb_total_pages(void)
+{
+	return 0;	/* this variant always reports zero huge pages */
+}
 
 #define follow_hugetlb_page(m,v,p,vs,a,b,i)	({ BUG(); 0; })
 #define follow_huge_addr(mm, vma, addr, write)	0
diff -puN include/linux/mm.h~hugetlb-overcommit-fix include/linux/mm.h
--- 25/include/linux/mm.h~hugetlb-overcommit-fix	2004-03-26 02:03:30.168350960 -0800
+++ 25-akpm/include/linux/mm.h	2004-03-26 02:03:30.181348984 -0800
@@ -112,6 +112,9 @@ struct vm_area_struct {
 #define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
 #define VM_NONLINEAR	0x00800000	/* Is non-linear (remap_file_pages) */
 
+/* Nonzero if VM_ACCOUNT may apply to this vma: hugetlb vmas are exempt. */
+#define VM_MAYACCT(vma) (!((vma)->vm_flags & VM_HUGETLB))
+
 #ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
 #endif
diff -puN mm/mmap.c~hugetlb-overcommit-fix mm/mmap.c
--- 25/mm/mmap.c~hugetlb-overcommit-fix	2004-03-26 02:03:30.170350656 -0800
+++ 25-akpm/mm/mmap.c	2004-03-26 02:03:30.182348832 -0800
@@ -489,9 +489,13 @@ unsigned long do_mmap_pgoff(struct file 
 	int correct_wcount = 0;
 	int error;
 	struct rb_node ** rb_link, * rb_parent;
+	int accountable = 1;
 	unsigned long charged = 0;
 
 	if (file) {
+		if (is_file_hugepages(file))
+			accountable = 0;
+
 		if (!file->f_op || !file->f_op->mmap)
 			return -ENODEV;
 
@@ -608,7 +612,8 @@ munmap_back:
 	    > current->rlim[RLIMIT_AS].rlim_cur)
 		return -ENOMEM;
 
-	if (!(flags & MAP_NORESERVE) || sysctl_overcommit_memory > 1) {
+	if (accountable && (!(flags & MAP_NORESERVE) ||
+			sysctl_overcommit_memory > 1)) {
 		if (vm_flags & VM_SHARED) {
 			/* Check memory availability in shmem_file_setup? */
 			vm_flags |= VM_ACCOUNT;
diff -puN mm/mprotect.c~hugetlb-overcommit-fix mm/mprotect.c
--- 25/mm/mprotect.c~hugetlb-overcommit-fix	2004-03-26 02:03:30.171350504 -0800
+++ 25-akpm/mm/mprotect.c	2004-03-26 02:03:30.182348832 -0800
@@ -173,7 +173,8 @@ mprotect_fixup(struct vm_area_struct *vm
 	 * a MAP_NORESERVE private mapping to writable will now reserve.
 	 */
 	if (newflags & VM_WRITE) {
-		if (!(vma->vm_flags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
+		if (!(vma->vm_flags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))
+				&& VM_MAYACCT(vma)) {
 			charged = (end - start) >> PAGE_SHIFT;
 			if (security_vm_enough_memory(charged))
 				return -ENOMEM;
diff -puN security/commoncap.c~hugetlb-overcommit-fix security/commoncap.c
--- 25/security/commoncap.c~hugetlb-overcommit-fix	2004-03-26 02:03:30.172350352 -0800
+++ 25-akpm/security/commoncap.c	2004-03-26 02:03:30.183348680 -0800
@@ -22,6 +22,7 @@
 #include <linux/netlink.h>
 #include <linux/ptrace.h>
 #include <linux/xattr.h>
+#include <linux/hugetlb.h>
 
 int cap_capable (struct task_struct *tsk, int cap)
 {
@@ -358,7 +359,8 @@ int cap_vm_enough_memory(long pages)
 		return -ENOMEM;
 	}
 
-	allowed = totalram_pages * sysctl_overcommit_ratio / 100;
+	allowed = (totalram_pages - hugetlb_total_pages())
+	       	* sysctl_overcommit_ratio / 100;
 	allowed += total_swap_pages;
 
 	if (atomic_read(&vm_committed_space) < allowed)
diff -puN security/dummy.c~hugetlb-overcommit-fix security/dummy.c
--- 25/security/dummy.c~hugetlb-overcommit-fix	2004-03-26 02:03:30.174350048 -0800
+++ 25-akpm/security/dummy.c	2004-03-26 02:03:30.184348528 -0800
@@ -25,6 +25,7 @@
 #include <linux/netlink.h>
 #include <net/sock.h>
 #include <linux/xattr.h>
+#include <linux/hugetlb.h>
 
 static int dummy_ptrace (struct task_struct *parent, struct task_struct *child)
 {
@@ -146,7 +147,8 @@ static int dummy_vm_enough_memory(long p
 		return -ENOMEM;
 	}
 
-	allowed = totalram_pages * sysctl_overcommit_ratio / 100;
+	allowed = (totalram_pages - hugetlb_total_pages())
+		* sysctl_overcommit_ratio / 100;
 	allowed += total_swap_pages;
 
 	if (atomic_read(&vm_committed_space) < allowed)
diff -puN security/selinux/hooks.c~hugetlb-overcommit-fix security/selinux/hooks.c
--- 25/security/selinux/hooks.c~hugetlb-overcommit-fix	2004-03-26 02:03:30.175349896 -0800
+++ 25-akpm/security/selinux/hooks.c	2004-03-26 02:03:30.186348224 -0800
@@ -59,6 +59,7 @@
 #include <net/af_unix.h>	/* for Unix socket types */
 #include <linux/parser.h>
 #include <linux/nfs_mount.h>
+#include <linux/hugetlb.h>
 
 #include "avc.h"
 #include "objsec.h"
@@ -1545,7 +1546,8 @@ static int selinux_vm_enough_memory(long
 		return -ENOMEM;
 	}
 
-	allowed = totalram_pages * sysctl_overcommit_ratio / 100;
+	allowed = (totalram_pages - hugetlb_total_pages())
+		* sysctl_overcommit_ratio / 100;
 	allowed += total_swap_pages;
 
 	if (atomic_read(&vm_committed_space) < allowed)
diff -puN arch/sh/mm/hugetlbpage.c~hugetlb-overcommit-fix arch/sh/mm/hugetlbpage.c
--- 25/arch/sh/mm/hugetlbpage.c~hugetlb-overcommit-fix	2004-03-26 02:07:12.911488856 -0800
+++ 25-akpm/arch/sh/mm/hugetlbpage.c	2004-03-26 02:07:49.756887504 -0800
@@ -501,6 +501,12 @@ int is_hugepage_mem_enough(size_t size)
 	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpagemem;
 }
 
+/* Return the memory held in the hugetlb pool, counted in PAGE_SIZE units. */
+unsigned long hugetlb_total_pages(void)
+{
+	return htlbzone_pages * (HPAGE_SIZE / PAGE_SIZE);
+}
+
 /*
  * We cannot handle pagefaults against hugetlb pages at all.  They cause
  * handle_mm_fault() to try to instantiate regular-sized pages in the

_