Sophie: kernel-2.6.18-194.11.1.el5 src

kernel-2.6.18-194.11.1.el5.src.rpm

From: Hendrik Brueckner <brueckner@redhat.com>
Date: Fri, 11 Dec 2009 10:11:56 -0500
Subject: [s390] kernel: correct TLB flush of page table entries
Message-id: <20091211101155.GC4579@redhat.com>
Patchwork-id: 21878
O-Subject: [RHEL5 U6 PATCH 1/1] s390 - kernel: correct TLB flush of page
	table entries
Bugzilla: 545527
RH-Acked-by: Pete Zaitcev <zaitcev@redhat.com>

Description
-----------
Accesses via invalid TLB entries can cause a variaty of problems,
from simple crashes to subtle data consistency issues.

The invalidation of a valid page table entry while it is in
use by another cpu needs to be done with a special instruction
like IPTE, IDTE or CSP. The standard unmap code uses two steps
two remove an entry from a page table. First the invalid bit
in the page table entry is set and later a global TLB flush is
performed. While the page table entry is in transit another cpu
can form an invalid TLB entry.

The unmap code needs to check for potential concurrent access to
the pages that are about to be removed from the page table. If
the possibility exists that another cpu is operating on the page
table at the same time use the IPTE instruction to invalidate the
page table entry.

(For more technical details, see the upstream link below).

Bugzilla
--------
BZ 545527
https://bugzilla.redhat.com/show_bug.cgi?id=545527

Upstream status of the patch
----------------------------
The patch is upstream as of kernel version 2.6.24
http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=ba8a9229ab9e80278c28ad68b15053f65b2b0a7c

Test status
-----------
The patch has been tested and fixes the problem.
The fix has been verified by the IBM test department.

Please ACK.

With best regards,

	Hendrik


diff --git a/include/asm-s390/pgalloc.h b/include/asm-s390/pgalloc.h
index 151c838..e4c0bad 100644
--- a/include/asm-s390/pgalloc.h
+++ b/include/asm-s390/pgalloc.h
@@ -70,7 +70,6 @@ static inline void pgd_free(pgd_t *pgd)
  */
 #define pmd_alloc_one(mm,address)       ({ BUG(); ((pmd_t *)2); })
 #define pmd_free(x)                     do { } while (0)
-#define __pmd_free_tlb(tlb,x)		do { } while (0)
 #define pgd_populate(mm, pmd, pte)      BUG()
 #else /* __s390x__ */
 static inline pmd_t * pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr)
@@ -90,12 +89,6 @@ static inline void pmd_free (pmd_t *pmd)
 	free_pages((unsigned long) pmd, PMD_ALLOC_ORDER);
 }
 
-#define __pmd_free_tlb(tlb,pmd)			\
-	do {					\
-		tlb_flush_mmu(tlb, 0, 0);	\
-		pmd_free(pmd);			\
-	 } while (0)
-
 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd)
 {
 	pgd_val(*pgd) = _PGD_ENTRY | __pa(pmd);
@@ -160,6 +153,4 @@ static inline void pte_free(struct page *pte)
         __free_page(pte);
 }
 
-#define __pte_free_tlb(tlb,pte) tlb_remove_page(tlb,pte)
-
 #endif /* _S390_PGALLOC_H */
diff --git a/include/asm-s390/pgtable.h b/include/asm-s390/pgtable.h
index ce3ce13..defa6f5 100644
--- a/include/asm-s390/pgtable.h
+++ b/include/asm-s390/pgtable.h
@@ -600,44 +600,80 @@ ptep_clear_flush_dirty(struct vm_area_struct *vma,
 	return ptep_test_and_clear_dirty(vma, address, ptep);
 }
 
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-	pte_t pte = *ptep;
-	pte_clear(mm, addr, ptep);
-	return pte;
-}
-
 static inline void __ptep_ipte(unsigned long address, pte_t *ptep)
 {
 	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
 #ifndef __s390x__
-		/* S390 has 1mb segments, we are emulating 4MB segments */
+		/* pto must point to the start of the segment table */
 		pte_t *pto = (pte_t *) (((unsigned long) ptep) & 0x7ffffc00);
 #else
 		/* ipte in zarch mode can do the math */
 		pte_t *pto = ptep;
 #endif
-		asm volatile ("ipte %2,%3"
-			      : "=m" (*ptep) : "m" (*ptep),
-				"a" (pto), "a" (address) );
+		asm volatile(
+			"       ipte    %2,%3"
+			: "=m" (*ptep) : "m" (*ptep),
+			"a" (pto), "a" (address));
 	}
+}
+
+static inline void ptep_invalidate(unsigned long address, pte_t *ptep)
+{
+	__ptep_ipte(address, ptep);
 	pte_val(*ptep) = _PAGE_TYPE_EMPTY;
 }
 
+/*
+ * This is hard to understand. ptep_get_and_clear and ptep_clear_flush
+ * both clear the TLB for the unmapped pte. The reason is that
+ * ptep_get_and_clear is used in common code (e.g. change_pte_range)
+ * to modify an active pte. The sequence is
+ *   1) ptep_get_and_clear
+ *   2) set_pte_at
+ *   3) flush_tlb_range
+ * On s390 the tlb needs to get flushed with the modification of the pte
+ * if the pte is active. The only way how this can be implemented is to
+ * have ptep_get_and_clear do the tlb flush. In exchange flush_tlb_range
+ * is a nop.
+ */
+#define ptep_get_and_clear(__mm, __address, __ptep)			\
+({									\
+	pte_t __pte = *(__ptep);					\
+	if (atomic_read(&(__mm)->mm_users) > 1 ||			\
+	    (__mm) != current->active_mm)				\
+		ptep_invalidate(__address, __ptep);			\
+	else								\
+		pte_clear((__mm), (__address), (__ptep));		\
+	__pte;								\
+})
+
 static inline pte_t
 ptep_clear_flush(struct vm_area_struct *vma,
 		 unsigned long address, pte_t *ptep)
 {
 	pte_t pte = *ptep;
-
-	__ptep_ipte(address, ptep);
+	ptep_invalidate(address, ptep);
 	return pte;
 }
 
-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+/*
+ * The batched pte unmap code uses ptep_get_and_clear_full to clear the
+ * ptes. Here an optimization is possible. tlb_gather_mmu flushes all
+ * tlbs of an mm if it can guarantee that the ptes of the mm_struct
+ * cannot be accessed while the batched unmap is running. In this case
+ * full==1 and a simple pte_clear is enough. See tlb.h.
+ */
+static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
+					    unsigned long addr,
+					    pte_t *ptep, int full)
 {
-	pte_t old_pte = *ptep;
-	set_pte_at(mm, addr, ptep, pte_wrprotect(old_pte));
+	pte_t pte = *ptep;
+
+	if (full)
+		pte_clear(mm, addr, ptep);
+	else
+		ptep_invalidate(addr, ptep);
+	return pte;
 }
 
 static inline void
@@ -649,6 +685,17 @@ ptep_establish(struct vm_area_struct *vma,
 	set_pte(ptep, entry);
 }
 
+#define ptep_set_wrprotect(__mm, __addr, __ptep)			\
+({									\
+	pte_t __pte = *(__ptep);					\
+	if (pte_write(__pte)) {						\
+		if (atomic_read(&(__mm)->mm_users) > 1 ||		\
+		    (__mm) != current->active_mm)			\
+			ptep_invalidate(__addr, __ptep);		\
+		set_pte_at(__mm, __addr, __ptep, pte_wrprotect(__pte));	\
+	}								\
+})
+
 #define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
 	ptep_establish(__vma, __address, __ptep, __entry)
 
@@ -979,6 +1026,7 @@ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset)
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
 #define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
 #define __HAVE_ARCH_PTE_SAME
diff --git a/include/asm-s390/tlb.h b/include/asm-s390/tlb.h
index 51bd957..5662b3a 100644
--- a/include/asm-s390/tlb.h
+++ b/include/asm-s390/tlb.h
@@ -2,19 +2,130 @@
 #define _S390_TLB_H
 
 /*
- * s390 doesn't need any special per-pte or
- * per-vma handling..
+ * TLB flushing on s390 is complicated. The following requirement
+ * from the principles of operation is the most arduous:
+ *
+ * "A valid table entry must not be changed while it is attached
+ * to any CPU and may be used for translation by that CPU except to
+ * (1) invalidate the entry by using INVALIDATE PAGE TABLE ENTRY,
+ * or INVALIDATE DAT TABLE ENTRY, (2) alter bits 56-63 of a page
+ * table entry, or (3) make a change by means of a COMPARE AND SWAP
+ * AND PURGE instruction that purges the TLB."
+ *
+ * The modification of a pte of an active mm struct therefore is
+ * a two step process: i) invalidate the pte, ii) store the new pte.
+ * This is true for the page protection bit as well.
+ * The only possible optimization is to flush at the beginning of
+ * a tlb_gather_mmu cycle if the mm_struct is currently not in use.
+ *
+ * Pages used for the page tables is a different story. FIXME: more
  */
-#define tlb_start_vma(tlb, vma) do { } while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
+
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <asm/processor.h>
+#include <asm/pgalloc.h>
+#include <asm/smp.h>
+#include <asm/tlbflush.h>
+
+#define FREE_PTE_NR     506
+
+#ifndef CONFIG_SMP
+#define TLB_NR_PTRS	1
+#else
+#define TLB_NR_PTRS	508
+#endif
+
+struct mmu_gather {
+	struct mm_struct *mm;
+	unsigned int fullmm;
+	unsigned int nr_ptes;
+	unsigned int nr_pmds;
+	void *array[TLB_NR_PTRS];
+};
+
+DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
+
+static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm,
+						unsigned int full_mm_flush)
+{
+	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
+
+	tlb->mm = mm;
+	tlb->fullmm = full_mm_flush || (num_online_cpus() == 1) ||
+		(atomic_read(&mm->mm_users) <= 1 && mm == current->active_mm);
+	tlb->nr_ptes = 0;
+	tlb->nr_pmds = TLB_NR_PTRS;
+	if (tlb->fullmm)
+		__tlb_flush_mm(mm);
+	return tlb;
+}
+
+static inline void tlb_flush_mmu(struct mmu_gather *tlb,
+				 unsigned long start, unsigned long end)
+{
+	if (!tlb->fullmm && (tlb->nr_ptes > 0 || tlb->nr_pmds < TLB_NR_PTRS))
+		__tlb_flush_mm(tlb->mm);
+	while (tlb->nr_ptes > 0)
+		pte_free(tlb->array[--tlb->nr_ptes]);
+	while (tlb->nr_pmds < TLB_NR_PTRS)
+		pmd_free((pmd_t *) tlb->array[tlb->nr_pmds++]);
+}
+
+static inline void tlb_finish_mmu(struct mmu_gather *tlb,
+				  unsigned long start, unsigned long end)
+{
+	tlb_flush_mmu(tlb, start, end);
+
+	/* keep the page table cache within bounds */
+	check_pgt_cache();
+
+	put_cpu_var(mmu_gathers);
+}
 
 /*
- * .. because we flush the whole mm when it
- * fills up.
+ * Release the page cache reference for a pte removed by
+ * tlb_ptep_clear_flush. In both flush modes the tlb fo a page cache page
+ * has already been freed, so just do free_page_and_swap_cache.
  */
-#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
+static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+	free_page_and_swap_cache(page);
+}
 
-#include <asm-generic/tlb.h>
+/*
+ * pte_free_tlb frees a pte table and clears the CRSTE for the
+ * page table from the tlb.
+ */
+static inline void pte_free_tlb(struct mmu_gather *tlb, struct page *page)
+{
+	if (!tlb->fullmm) {
+		tlb->array[tlb->nr_ptes++] = page;
+		if (tlb->nr_ptes >= tlb->nr_pmds)
+			tlb_flush_mmu(tlb, 0, 0);
+	} else
+		pte_free(page);
+}
 
+/*
+ * pmd_free_tlb frees a pmd table and clears the CRSTE for the
+ * segment table entry from the tlb.
+ */
+static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+{
+#ifdef __s390x__
+	if (!tlb->fullmm) {
+		tlb->array[--tlb->nr_pmds] = (struct page *) pmd;
+		if (tlb->nr_ptes >= tlb->nr_pmds)
+			tlb_flush_mmu(tlb, 0, 0);
+	} else
+		pmd_free(pmd);
 #endif
+}
+
+#define tlb_start_vma(tlb, vma)			do { } while (0)
+#define tlb_end_vma(tlb, vma)			do { } while (0)
+#define tlb_remove_tlb_entry(tlb, ptep, addr)	do { } while (0)
+#define tlb_migrate_finish(mm)			do { } while (0)
+
+#endif /* _S390_TLB_H */
diff --git a/include/asm-s390/tlbflush.h b/include/asm-s390/tlbflush.h
index 73cd85b..81d0240 100644
--- a/include/asm-s390/tlbflush.h
+++ b/include/asm-s390/tlbflush.h
@@ -5,109 +5,49 @@
 #include <asm/processor.h>
 
 /*
- * TLB flushing:
- *
- *  - flush_tlb() flushes the current mm struct TLBs
- *  - flush_tlb_all() flushes all processes TLBs 
- *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
- *  - flush_tlb_page(vma, vmaddr) flushes one page
- *  - flush_tlb_range(vma, start, end) flushes a range of pages
- *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
- *  - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
- */
-
-/*
- * S/390 has three ways of flushing TLBs
- * 'ptlb' does a flush of the local processor
- * 'csp' flushes the TLBs on all PUs of a SMP
- * 'ipte' invalidates a pte in a page table and flushes that out of
- * the TLBs of all PUs of a SMP
- */
-
-#define local_flush_tlb() \
-do {  __asm__ __volatile__("ptlb": : :"memory"); } while (0)
-
-#ifndef CONFIG_SMP
-
-/*
- * We always need to flush, since s390 does not flush tlb
- * on each context switch
+ * Flush all tlb entries on the local cpu.
  */
-
-static inline void flush_tlb(void)
-{
-	local_flush_tlb();
-}
-static inline void flush_tlb_all(void)
-{
-	local_flush_tlb();
-}
-static inline void flush_tlb_mm(struct mm_struct *mm) 
+static inline void local_flush_tlb(void)
 {
-	local_flush_tlb();
+	asm volatile("ptlb" : : : "memory");
 }
-static inline void flush_tlb_page(struct vm_area_struct *vma,
-				  unsigned long addr)
-{
-	local_flush_tlb();
-}
-static inline void flush_tlb_range(struct vm_area_struct *vma,
-				   unsigned long start, unsigned long end)
-{
-	local_flush_tlb();
-}
-
-#define flush_tlb_kernel_range(start, end) \
-	local_flush_tlb();
-
-#else
 
-#include <asm/smp.h>
-
-extern void smp_ptlb_all(void);
+#ifdef CONFIG_SMP
+/*
+ * Flush all tlb entries on all cpus.
+ */
+void smp_ptlb_all(void);
 
 static inline void global_flush_tlb(void)
 {
+	register unsigned long reg2 asm("2");
+	register unsigned long reg3 asm("3");
+	register unsigned long reg4 asm("4");
+	long dummy;
+
 #ifndef __s390x__
 	if (!MACHINE_HAS_CSP) {
 		smp_ptlb_all();
 		return;
 	}
 #endif /* __s390x__ */
-	{
-		register unsigned long addr asm("4");
-		long dummy;
-
-		dummy = 0;
-		addr = ((unsigned long) &dummy) + 1;
-		__asm__ __volatile__ (
-			"    slr  2,2\n"
-			"    slr  3,3\n"
-			"    csp  2,%0"
-			: : "a" (addr), "m" (dummy) : "cc", "2", "3" );
-	}
-}
 
-/*
- * We only have to do global flush of tlb if process run since last
- * flush on any other pu than current. 
- * If we have threads (mm->count > 1) we always do a global flush, 
- * since the process runs on more than one processor at the same time.
- */
+	dummy = 0;
+	reg2 = reg3 = 0;
+	reg4 = ((unsigned long) &dummy) + 1;
+	asm volatile(
+		"	csp	%0,%2"
+		: : "d" (reg2), "d" (reg3), "d" (reg4), "m" (dummy) : "cc" );
+}
 
-static inline void __flush_tlb_mm(struct mm_struct * mm)
+static inline void __tlb_flush_full(struct mm_struct *mm)
 {
 	cpumask_t local_cpumask;
 
-	if (unlikely(cpus_empty(mm->cpu_vm_mask)))
-		return;
-	if (MACHINE_HAS_IDTE) {
-		asm volatile (".insn rrf,0xb98e0000,0,%0,%1,0"
-			      : : "a" (2048),
-			      "a" (__pa(mm->pgd)&PAGE_MASK) : "cc" );
-		return;
-	}
 	preempt_disable();
+	/*
+	 * If the process only ran on the local cpu, do a local flush.
+	 */
 	local_cpumask = cpumask_of_cpu(smp_processor_id());
 	if (cpus_equal(mm->cpu_vm_mask, local_cpumask))
 		local_flush_tlb();
@@ -115,38 +55,86 @@ static inline void __flush_tlb_mm(struct mm_struct * mm)
 		global_flush_tlb();
 	preempt_enable();
 }
+#else
+#define __tlb_flush_full(mm)	__tlb_flush_local()
+#endif
 
-static inline void flush_tlb(void)
+/*
+ * Flush all tlb entries of a page table on all cpus.
+ */
+static inline void __tlb_flush_idte(pgd_t *pgd)
 {
-	__flush_tlb_mm(current->mm);
+	asm volatile(
+		"	.insn	rrf,0xb98e0000,0,%0,%1,0"
+		: : "a" (2048), "a" (__pa(pgd) & PAGE_MASK) : "cc" );
 }
-static inline void flush_tlb_all(void)
+
+static inline void __tlb_flush_mm(struct mm_struct * mm)
 {
-	global_flush_tlb();
+	if (unlikely(cpus_empty(mm->cpu_vm_mask)))
+		return;
+	/*
+	 * If the machine has IDTE we prefer to do a per mm flush
+	 * on all cpus instead of doing a local flush if the mm
+	 * only ran on the local cpu.
+	 */
+	if (MACHINE_HAS_IDTE) {
+		__tlb_flush_idte(mm->pgd);
+		return;
+	}
+	__tlb_flush_full(mm);
+}
+
+static inline void __tlb_flush_mm_cond(struct mm_struct * mm)
+{
+	if (atomic_read(&mm->mm_users) <= 1 && mm == current->active_mm)
+		__tlb_flush_mm(mm);
 }
-static inline void flush_tlb_mm(struct mm_struct *mm) 
+
+static inline void flush_tlb_pgtables(struct mm_struct *mm,
+                                      unsigned long start, unsigned long end)
 {
-	__flush_tlb_mm(mm); 
+        /* S/390 does not keep any page table caches in TLB */
 }
-static inline void flush_tlb_page(struct vm_area_struct *vma,
-				  unsigned long addr)
+
+/*
+ * TLB flushing:
+ *  flush_tlb() - flushes the current mm struct TLBs
+ *  flush_tlb_all() - flushes all processes TLBs
+ *  flush_tlb_mm(mm) - flushes the specified mm context TLB's
+ *  flush_tlb_page(vma, vmaddr) - flushes one page
+ *  flush_tlb_range(vma, start, end) - flushes a range of pages
+ *  flush_tlb_kernel_range(start, end) - flushes a range of kernel pages
+ *  flush_tlb_pgtables(mm, start, end) flushes a range of page tables
+ */
+
+/*
+ * flush_tlb_mm goes together with ptep_set_wrprotect for the
+ * copy_page_range operation and flush_tlb_range is related to
+ * ptep_get_and_clear for change_protection. ptep_set_wrprotect and
+ * ptep_get_and_clear do not flush the TLBs directly if the mm has
+ * only one user. At the end of the update the flush_tlb_mm and
+ * flush_tlb_range functions need to do the flush.
+ */
+#define flush_tlb()				do { } while (0)
+#define flush_tlb_all()				do { } while (0)
+#define flush_tlb_page(vma, addr)		do { } while (0)
+
+static inline void flush_tlb_mm(struct mm_struct *mm)
 {
-	__flush_tlb_mm(vma->vm_mm);
+	__tlb_flush_mm_cond(mm);
 }
+
 static inline void flush_tlb_range(struct vm_area_struct *vma,
 				   unsigned long start, unsigned long end)
 {
-	__flush_tlb_mm(vma->vm_mm); 
+	__tlb_flush_mm_cond(vma->vm_mm);
 }
 
-#define flush_tlb_kernel_range(start, end) global_flush_tlb()
-
-#endif
-
-static inline void flush_tlb_pgtables(struct mm_struct *mm,
-                                      unsigned long start, unsigned long end)
+static inline void flush_tlb_kernel_range(unsigned long start,
+					  unsigned long end)
 {
-        /* S/390 does not keep any page table caches in TLB */
+	__tlb_flush_mm(&init_mm);
 }
 
 #endif /* _S390_TLBFLUSH_H */