Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > 27922b4260f65d317aabda37e42bbbff > files > 2189

kernel-2.6.18-238.el5.src.rpm

From: Dave Anderson <anderson@redhat.com>
Date: Fri, 27 Mar 2009 10:43:31 -0400
Subject: [mm] enable dumping of hugepages into core dumps
Message-id: 49CCE613.4090006@redhat.com
O-Subject: [RHEL5.4 PATCH] BZ #470411: enable dumping of hugepages into core dumps
Bugzilla: 470411
RH-Acked-by: Anton Arapov <aarapov@redhat.com>
RH-Acked-by: Larry Woodman <lwoodman@redhat.com>

BZ #470411: 470411 - FEAT RHEL5.4: hugepage coredump
https://bugzilla.redhat.com/show_bug.cgi?id=470411

This 2.6.28 feature enables the dumping of private and/or shared hugepages
into core dumps with the addition of 2 new /proc/<pid>/coredump_filter bit
settings, MMF_DUMP_HUGETLB_PRIVATE and MMF_DUMP_HUGETLB_SHARED.

It is a backport of these two 2.6.28 commits:

   commit e575f111dc0f27044e170580e7de50985ab3e011
   Author: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
   Date:   Sat Oct 18 20:27:08 2008 -0700

     coredump_filter: add hugepage dumping

     Presently hugepage's vma has a VM_RESERVED flag in order not to be
     swapped.  But a VM_RESERVED vma isn't core dumped because this flag is
     often used for some kernel vmas (e.g.  vmalloc, sound related).

     Thus hugepages are never dumped and it can't be debugged easily.  Many
     developers want hugepages to be included into core-dump.

     However, we can't read a generic VM_RESERVED area because such an area is
     often an I/O mapping area, and reading it may change device state.  That
     is definitely an undesirable side effect.

     So adding a hugepage-specific bit to the coredump filter is better.  It
     enables hugepage core dumping and doesn't cause any side effects on
     any I/O devices.

     ... [ snip ] ...

     Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
     Reviewed-by: Kawai Hidehiro <hidehiro.kawai.ez@hitachi.com>
     Cc: Hugh Dickins <hugh@veritas.com>
     Cc: William Irwin <wli@holomorphy.com>
     Cc: Adam Litke <agl@us.ibm.com>
     Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
     Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

   commit 4b2e38ad703541f7845c2d766426148b8d1aa329
   Author: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
   Date:   Sat Oct 18 20:27:10 2008 -0700

     hugepage: support ZERO_PAGE()

     Presently hugepage doesn't use zero page at all because zero page is only
     used for coredumping and hugepage can't core dump.

     However we have now implemented hugepage coredumping.  Therefore we should
     implement the zero page of hugepage.

     ... [ snip ] ...

     Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
     Cc: Adam Litke <agl@us.ibm.com>
     Cc: Hugh Dickins <hugh@veritas.com>
     Cc: Kawai Hidehiro <hidehiro.kawai.ez@hitachi.com>
     Cc: Mel Gorman <mel@skynet.ie>
     Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
     Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

The only functional difference from upstream is that MMF_DUMP_HUGETLB_PRIVATE
is not set in MMF_DUMP_FILTER_DEFAULT, leaving the default RHEL5 behaviour
unchanged.

Tested by me on all architectures, and also by Fujitsu on ia64.

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index b61eccb..92ec0c7 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1987,10 +1987,17 @@ The following 4 memory types are supported:
   - (bit 1) anonymous shared memory
   - (bit 2) file-backed private memory
   - (bit 3) file-backed shared memory
+  - (bit 4) ELF header pages in file-backed private memory areas (it is
+	    effective only if the bit 2 is cleared)
+  - (bit 5) hugetlb private memory
+  - (bit 6) hugetlb shared memory
 
   Note that MMIO pages such as frame buffer are never dumped and vDSO pages
   are always dumped regardless of the bitmask status.
 
+  Note bit 0-4 doesn't effect any hugetlb memory. hugetlb memory are only
+  effected by bit 5-6.
+
 Default value of coredump_filter is 0x3; this means all anonymous memory
 segments are dumped.
 
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index df3b48e..11a4ac7 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1261,16 +1261,24 @@ static int dump_seek(struct file *file, loff_t off)
 static unsigned long vma_dump_size(struct vm_area_struct *vma,
 				   unsigned long mm_flags)
 {
+#define FILTER(type)   (mm_flags & (1UL << MMF_DUMP_##type))
+
 	/* The vma can be set up to tell us the answer directly.  */
 	if (vma->vm_flags & VM_ALWAYSDUMP)
 		goto whole;
 
+	/* Hugetlb memory check */
+	if (vma->vm_flags & VM_HUGETLB) {
+		if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
+			goto whole;
+		if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
+			goto whole;
+	}
+
 	/* Do not dump I/O mapped devices or special mappings */
 	if (vma->vm_flags & (VM_IO | VM_RESERVED))
 		return 0;
 
-#define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))
-
 	/* By default, dump shared memory if mapped from an anonymous file. */
 	if (vma->vm_flags & VM_SHARED) {
 		if (vma->vm_file->f_dentry->d_inode->i_nlink == 0 ?
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e3a6e26..d2ec53c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -322,7 +322,9 @@ typedef unsigned long mm_counter_t;
 #define MMF_DUMP_MAPPED_PRIVATE	2
 #define MMF_DUMP_MAPPED_SHARED	3
 #define MMF_DUMP_ELF_HEADERS	4
-#define MMF_DUMP_FILTER_BITS	5
+#define MMF_DUMP_HUGETLB_PRIVATE 5
+#define MMF_DUMP_HUGETLB_SHARED  6
+#define MMF_DUMP_FILTER_BITS	7
 #define MMF_DUMP_FILTER_MASK ((1 << MMF_DUMP_FILTER_BITS) - 1)
 #define MMF_DUMP_FILTER_DEFAULT \
 	((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c38767f..7dee06f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -648,6 +648,14 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return ret;
 }
 
+static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
+{
+	if (!ptep || write || shared)
+		return 0;
+	else
+		return huge_pte_none(huge_ptep_get(ptep));
+}
+
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			struct page **pages, struct vm_area_struct **vmas,
 			unsigned long *position, int *length, int i,
@@ -656,6 +664,8 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long pfn_offset;
 	unsigned long vaddr = *position;
 	int remainder = *length;
+	int zeropage_ok = 0;
+	int shared = vma->vm_flags & VM_SHARED;
 
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
@@ -668,8 +678,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * first, for the page indexing below to work.
 		 */
 		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
+		if (huge_zeropage_ok(pte, write, shared))
+			zeropage_ok = 1;
 
-		if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
+		if (!pte || 
+		    (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) ||
 		    (write && !pte_write(huge_ptep_get(pte)))) {
 			int ret;
 
@@ -689,10 +702,13 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page = pte_page(huge_ptep_get(pte));
 same_page:
 		if (pages) {
-			get_page(page);
+			if (zeropage_ok)
+				pages[i] = ZERO_PAGE(0);
+			else
+				pages[i] = page + pfn_offset;
+			get_page(pages[i]);
 			if (write && !PageGUP(page))
 				SetPageGUP(page);
-			pages[i] = page + pfn_offset;
 		}
 
 		if (vmas)