From: Kimball Murray <kmurray@redhat.com>
Date: Fri, 22 Jun 2007 11:24:13 -0400
Subject: [mm] xen: memory tracking cleanups
Message-id: 20070622152128.10093.86634.sendpatchset@dhcp83-86.boston.redhat.com
O-Subject: [RHEL5 U1 Patch 1/1 (revised)] memory tracking patch only partially applied to Xen kernel (BZ-242514)
Bugzilla: 242514

This is the revised memory tracking patch that fixes the x86_64 regression
caused by its predecessor.  The previous patch changed pte_mkclean to use
pte_val instead of __pte_val.  Under Xen, this is a big problem because the
Xen version of pte_val behaves differently depending on whether the input
pte refers to a USER or KERNEL page.  With this error, pte_mkclean was not
only clearing the dirty bit(s), but also changing the address bits, resulting
in general chaos.

The new patch uses __pte_val inside of pte_mkclean.  I have heard back from
all the disgruntled test folks who had to witness this fiasco, and every one
of them reports that the revised patch fixes the regression; my own testing
confirms this as well.
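
To make the failure mode concrete, here is a small stand-alone sketch of the
difference described above.  It is only a toy model: raw_val() and
translated_val() are stand-ins for __pte_val and the Xen flavour of pte_val,
and the translation offset is made up purely for illustration; the real
definitions live in the Xen pgtable headers, and the actual fix is in the
diff further down.

/* Toy user-space model of the bug described above.  None of these
 * names or values are the real kernel/Xen definitions. */
#include <stdio.h>
#include <stdint.h>

#define DIRTY_BIT  0x040ULL
#define ADDR_MASK  (~0xfffULL)

/* Stand-in for __pte_val(): the raw bits stored in the entry. */
static uint64_t raw_val(uint64_t pte) { return pte; }

/* Stand-in for Xen's pte_val() on a USER page: the frame-number bits
 * come back translated, so they no longer match what is stored. */
static uint64_t translated_val(uint64_t pte)
{
	return (pte & ~ADDR_MASK) | ((pte & ADDR_MASK) + 0x100000ULL);
}

int main(void)
{
	uint64_t pte = 0x123456000ULL | DIRTY_BIT;

	/* Old, broken pte_mkclean: result built from the translated
	 * value, so the address bits change along with the dirty bit. */
	uint64_t broken = translated_val(pte) & ~DIRTY_BIT;

	/* Revised pte_mkclean: operates on the raw bits, so only the
	 * dirty bit is cleared and the address bits are untouched. */
	uint64_t fixed = raw_val(pte) & ~DIRTY_BIT;

	printf("original %#llx  broken %#llx  fixed %#llx\n",
	       (unsigned long long)pte,
	       (unsigned long long)broken,
	       (unsigned long long)fixed);
	return 0;
}

Compile and run it with any C compiler to see the "broken" variant change the
upper address bits while the "fixed" variant only drops the dirty bit.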

For those who have reviewed the previous patch and only want to look at what
changed between the two patches, here is the diff:

------------------------------------------------------------

Acked-by: Rik van Riel <riel@redhat.com>

diff --git a/arch/x86_64/mm/track.c b/arch/x86_64/mm/track.c
index 1566f85..fb606d7 100644
--- a/arch/x86_64/mm/track.c
+++ b/arch/x86_64/mm/track.c
@@ -48,6 +48,10 @@ void do_mm_track_pte(void * val)
 	if (pfn >= mm_tracking_struct.bitcnt)
 		return;
 
+#ifdef CONFIG_XEN
+	pfn = pfn_to_mfn(pfn);
+#endif
+
 	if (!test_and_set_bit(pfn, mm_tracking_struct.vector))
 		atomic_inc(&mm_tracking_struct.count);
 }
@@ -92,6 +96,10 @@ static inline void track_as_pte(void *val) {
 	if (pfn >= mm_tracking_struct.bitcnt)
 		return;
 
+#ifdef CONFIG_XEN
+	pfn = pfn_to_mfn(pfn);
+#endif
+
 	if (!test_and_set_bit(pfn, mm_tracking_struct.vector))
 		atomic_inc(&mm_tracking_struct.count);
 }
@@ -117,6 +125,10 @@ void do_mm_track_phys(void *val)
 	if (pfn >= mm_tracking_struct.bitcnt)
 		return;
 
+#ifdef CONFIG_XEN
+	pfn = pfn_to_mfn(pfn);
+#endif
+
 	if (!test_and_set_bit(pfn, mm_tracking_struct.vector))
 		atomic_inc(&mm_tracking_struct.count);
 }
diff --git a/include/asm-x86_64/mach-xen/asm/pgtable.h b/include/asm-x86_64/mach-xen/asm/pgtable.h
index a104104..e6eddf9 100644
--- a/include/asm-x86_64/mach-xen/asm/pgtable.h
+++ b/include/asm-x86_64/mach-xen/asm/pgtable.h
@@ -14,6 +14,7 @@
 #include <linux/threads.h>
 #include <linux/sched.h>
 #include <asm/pda.h>
+#include <asm/mm_track.h>
 #ifdef CONFIG_XEN
 #include <asm/hypervisor.h>
 
@@ -98,19 +99,29 @@ extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
 #define pgd_none(x)	(!pgd_val(x))
 #define pud_none(x)	(!pud_val(x))
 
-#define set_pte_batched(pteptr, pteval) \
-	queue_l1_entry_update(pteptr, (pteval))
-
 extern inline int pud_present(pud_t pud)	{ return !pud_none(pud); }
 
 static inline void set_pte(pte_t *dst, pte_t val)
 {
+	mm_track_pte(dst);
 	*dst = val;
 }
 
-#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval))
-#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval))
-#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval))
+static inline void set_pmd(pmd_t *pmdptr, pmd_t pmdval)
+{
+	mm_track_pmd(pmdptr);
+	xen_l2_entry_update(pmdptr, (pmdval));
+}
+static inline void set_pud(pud_t *pudptr, pud_t pudval)
+{
+	mm_track_pud(pudptr);
+	xen_l3_entry_update(pudptr, (pudval));	
+}
+static inline void set_pgd(pgd_t *pgdptr, pgd_t pgdval)
+{
+	mm_track_pgd(pgdptr);
+	xen_l4_entry_update(pgdptr, (pgdval));	
+}
 
 static inline void pud_clear (pud_t * pud)
 {
@@ -137,7 +148,11 @@ static inline void pgd_clear (pgd_t * pgd)
  * each domain will have separate page tables, with their own versions of
  * accessed & dirty state.
  */
-#define ptep_get_and_clear(mm,addr,xp)	__pte_ma(xchg(&(xp)->pte, 0))
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp)
+{
+	mm_track_pte(xp);
+	return __pte_ma(xchg(&(xp)->pte, 0));
+}
 
 #if 0
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp)
@@ -156,6 +171,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long
 	pte_t pte;
 	if (full) {
 		pte = *ptep;
+		mm_track_pte(ptep);
 		*ptep = __pte(0);
 	} else {
 		pte = ptep_get_and_clear(mm, addr, ptep);
@@ -195,6 +211,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long
 #define _PAGE_BIT_DIRTY		6
 #define _PAGE_BIT_PSE		7	/* 4 MB (or 2MB) page */
 #define _PAGE_BIT_GLOBAL	8	/* Global TLB entry PPro+ */
+#define _PAGE_BIT_SOFTDIRTY	9	/* save dirty state when hdw dirty bit cleared */
 #define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
 
 #define _PAGE_PRESENT	0x001
@@ -207,6 +224,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long
 #define _PAGE_PSE	0x080	/* 2MB page */
 #define _PAGE_FILE	0x040	/* nonlinear file mapping, saved PTE; unset:swap */
 #define _PAGE_GLOBAL	0x100	/* Global TLB entry */
+#define _PAGE_SOFTDIRTY	0x200
 
 #define _PAGE_PROTNONE	0x080	/* If not present */
 #define _PAGE_NX        (_AC(1,UL)<<_PAGE_BIT_NX)
@@ -214,7 +232,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long
 #define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
 
-#define _PAGE_CHG_MASK	(PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _PAGE_CHG_MASK	(PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_SOFTDIRTY)
 
 #define PAGE_NONE	__pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
 #define PAGE_SHARED	__pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
@@ -329,7 +347,7 @@ static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
 static inline int pte_user(pte_t pte)		{ return __pte_val(pte) & _PAGE_USER; }
 static inline int pte_read(pte_t pte)		{ return __pte_val(pte) & _PAGE_USER; }
 static inline int pte_exec(pte_t pte)		{ return __pte_val(pte) & _PAGE_USER; }
-static inline int pte_dirty(pte_t pte)		{ return __pte_val(pte) & _PAGE_DIRTY; }
+static inline int pte_dirty(pte_t pte)		{ return __pte_val(pte) & (_PAGE_DIRTY | _PAGE_SOFTDIRTY); }
 static inline int pte_young(pte_t pte)		{ return __pte_val(pte) & _PAGE_ACCESSED; }
 static inline int pte_write(pte_t pte)		{ return __pte_val(pte) & _PAGE_RW; }
 static inline int pte_file(pte_t pte)		{ return __pte_val(pte) & _PAGE_FILE; }
@@ -337,7 +355,12 @@ static inline int pte_huge(pte_t pte)		{ return __pte_val(pte) & _PAGE_PSE; }
 
 static inline pte_t pte_rdprotect(pte_t pte)	{ __pte_val(pte) &= ~_PAGE_USER; return pte; }
 static inline pte_t pte_exprotect(pte_t pte)	{ __pte_val(pte) &= ~_PAGE_USER; return pte; }
-static inline pte_t pte_mkclean(pte_t pte)	{ __pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
+static inline pte_t pte_mkclean(pte_t pte)
+{
+	mm_track_pte(&pte);
+	__pte_val(pte) &= ~(_PAGE_SOFTDIRTY|_PAGE_DIRTY);
+	return pte;
+}
 static inline pte_t pte_mkold(pte_t pte)	{ __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
 static inline pte_t pte_wrprotect(pte_t pte)	{ __pte_val(pte) &= ~_PAGE_RW; return pte; }
 static inline pte_t pte_mkread(pte_t pte)	{ __pte_val(pte) |= _PAGE_USER; return pte; }