Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > fc11cd6e1c513a17304da94a5390f3cd > files > 4312

kernel-2.6.18-194.11.1.el5.src.rpm

From: Gerd Hoffmann <kraxel@redhat.com>
Subject: [RHEL5 PATCHES] xen kdump/kexec 1/3: backports
Date: Thu, 07 Jun 2007 12:42:13 +0200
Bugzilla: 212843
Message-Id: <4667E105.1070201@redhat.com>
Changelog: [xen] kdump/kexec support

First set of patches in preparation of xen kexec/kdump (dom0, i.e.
physical machine) support.  These are all mainline backports.  Detailed
description is in each individual patch file.

This is a repost.  Changes:
  * Bugzilla number updated (lots of dups for that one).
  * added one patch (kexec-reloc-merge-fixup) which fixes a bug
    introduced by the reloc patches: __pa() macro changes behavior.

CONFIG_KEXEC=y

cheers,
   Gerd

X-Git-Tag: v2.6.19^0~2018^2~54
X-Git-Url: http://localhost/gitweb?p=linux-2.6%2F.git;a=commitdiff_plain;h=3566561bfadffcb5dbc85d576be80c0dbf2cccc9

[PATCH] i386: Avoid overwriting the current pgd (V4, i386)

kexec: Avoid overwriting the current pgd (V4, i386)

This patch upgrades the i386-specific kexec code to avoid overwriting the
current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
to start a secondary kernel that dumps the memory of the previous kernel.

The code introduces a new set of page tables. These tables are used to provide
an executable identity mapping without overwriting the current pgd.

Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/machine_kexec.c   |  115 +++++---------------------
 arch/i386/kernel/relocate_kernel.S |  162 +++++++++++++++++++++++++++++++++----
 include/asm-i386/kexec.h           |   27 ++++++
 3 files changed, 200 insertions(+), 104 deletions(-)

Index: linux-2.6.18.noarch/arch/i386/kernel/machine_kexec.c
===================================================================
--- linux-2.6.18.noarch.orig/arch/i386/kernel/machine_kexec.c
+++ linux-2.6.18.noarch/arch/i386/kernel/machine_kexec.c
@@ -20,70 +20,13 @@
 #include <asm/system.h>
 
 #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
-
-#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define L2_ATTR (_PAGE_PRESENT)
-
-#define LEVEL0_SIZE (1UL << 12UL)
-
-#ifndef CONFIG_X86_PAE
-#define LEVEL1_SIZE (1UL << 22UL)
-static u32 pgtable_level1[1024] PAGE_ALIGNED;
-
-static void identity_map_page(unsigned long address)
-{
-	unsigned long level1_index, level2_index;
-	u32 *pgtable_level2;
-
-	/* Find the current page table */
-	pgtable_level2 = __va(read_cr3());
-
-	/* Find the indexes of the physical address to identity map */
-	level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
-	level2_index = address / LEVEL1_SIZE;
-
-	/* Identity map the page table entry */
-	pgtable_level1[level1_index] = address | L0_ATTR;
-	pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
-
-	/* Flush the tlb so the new mapping takes effect.
-	 * Global tlb entries are not flushed but that is not an issue.
-	 */
-	load_cr3(pgtable_level2);
-}
-
-#else
-#define LEVEL1_SIZE (1UL << 21UL)
-#define LEVEL2_SIZE (1UL << 30UL)
-static u64 pgtable_level1[512] PAGE_ALIGNED;
-static u64 pgtable_level2[512] PAGE_ALIGNED;
-
-static void identity_map_page(unsigned long address)
-{
-	unsigned long level1_index, level2_index, level3_index;
-	u64 *pgtable_level3;
-
-	/* Find the current page table */
-	pgtable_level3 = __va(read_cr3());
-
-	/* Find the indexes of the physical address to identity map */
-	level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
-	level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE;
-	level3_index = address / LEVEL2_SIZE;
-
-	/* Identity map the page table entry */
-	pgtable_level1[level1_index] = address | L0_ATTR;
-	pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
-	set_64bit(&pgtable_level3[level3_index],
-					       __pa(pgtable_level2) | L2_ATTR);
-
-	/* Flush the tlb so the new mapping takes effect.
-	 * Global tlb entries are not flushed but that is not an issue.
-	 */
-	load_cr3(pgtable_level3);
-}
+static u32 kexec_pgd[1024] PAGE_ALIGNED;
+#ifdef CONFIG_X86_PAE
+static u32 kexec_pmd0[1024] PAGE_ALIGNED;
+static u32 kexec_pmd1[1024] PAGE_ALIGNED;
 #endif
+static u32 kexec_pte0[1024] PAGE_ALIGNED;
+static u32 kexec_pte1[1024] PAGE_ALIGNED;
 
 static void set_idt(void *newidt, __u16 limit)
 {
@@ -127,16 +70,6 @@ static void load_segments(void)
 #undef __STR
 }
 
-typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
-					unsigned long indirection_page,
-					unsigned long reboot_code_buffer,
-					unsigned long start_address,
-					unsigned int has_pae) ATTRIB_NORET;
-
-extern const unsigned char relocate_new_kernel[];
-extern void relocate_new_kernel_end(void);
-extern const unsigned int relocate_new_kernel_size;
-
 /*
  * A architecture hook called to validate the
  * proposed image and prepare the control pages
@@ -169,25 +102,29 @@ void machine_kexec_cleanup(struct kimage
  */
 NORET_TYPE void machine_kexec(struct kimage *image)
 {
-	unsigned long page_list;
-	unsigned long reboot_code_buffer;
-
-	relocate_new_kernel_t rnk;
+	unsigned long page_list[PAGES_NR];
+	void *control_page;
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
 
-	/* Compute some offsets */
-	reboot_code_buffer = page_to_pfn(image->control_code_page)
-								<< PAGE_SHIFT;
-	page_list = image->head;
-
-	/* Set up an identity mapping for the reboot_code_buffer */
-	identity_map_page(reboot_code_buffer);
+	control_page = page_address(image->control_code_page);
+	memcpy(control_page, relocate_kernel, PAGE_SIZE);
 
-	/* copy it out */
-	memcpy((void *)reboot_code_buffer, relocate_new_kernel,
-						relocate_new_kernel_size);
+	page_list[PA_CONTROL_PAGE] = __pa(control_page);
+	page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
+	page_list[PA_PGD] = __pa(kexec_pgd);
+	page_list[VA_PGD] = (unsigned long)kexec_pgd;
+#ifdef CONFIG_X86_PAE
+	page_list[PA_PMD_0] = __pa(kexec_pmd0);
+	page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
+	page_list[PA_PMD_1] = __pa(kexec_pmd1);
+	page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
+#endif
+	page_list[PA_PTE_0] = __pa(kexec_pte0);
+	page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
+	page_list[PA_PTE_1] = __pa(kexec_pte1);
+	page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
 
 	/* The segment registers are funny things, they have both a
 	 * visible and an invisible part.  Whenever the visible part is
@@ -206,6 +143,6 @@ NORET_TYPE void machine_kexec(struct kim
 	set_idt(phys_to_virt(0),0);
 
 	/* now call it */
-	rnk = (relocate_new_kernel_t) reboot_code_buffer;
-	(*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae);
+	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+			image->start, cpu_has_pae);
 }
Index: linux-2.6.18.noarch/arch/i386/kernel/relocate_kernel.S
===================================================================
--- linux-2.6.18.noarch.orig/arch/i386/kernel/relocate_kernel.S
+++ linux-2.6.18.noarch/arch/i386/kernel/relocate_kernel.S
@@ -7,16 +7,138 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/page.h>
+#include <asm/kexec.h>
+
+/*
+ * Must be relocatable PIC code callable as a C function
+ */
+
+#define PTR(x) (x << 2)
+#define PAGE_ALIGNED (1 << PAGE_SHIFT)
+#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
+#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */
+
+	.text
+	.align PAGE_ALIGNED
+	.globl relocate_kernel
+relocate_kernel:
+	movl	8(%esp), %ebp /* list of pages */
+
+#ifdef CONFIG_X86_PAE
+	/* map the control page at its virtual address */
+
+	movl	PTR(VA_PGD)(%ebp), %edi
+	movl	PTR(VA_CONTROL_PAGE)(%ebp), %eax
+	andl	$0xc0000000, %eax
+	shrl	$27, %eax
+	addl	%edi, %eax
+
+	movl	PTR(PA_PMD_0)(%ebp), %edx
+	orl	$PAE_PGD_ATTR, %edx
+	movl	%edx, (%eax)
+
+	movl	PTR(VA_PMD_0)(%ebp), %edi
+	movl	PTR(VA_CONTROL_PAGE)(%ebp), %eax
+	andl	$0x3fe00000, %eax
+	shrl	$18, %eax
+	addl	%edi, %eax
+
+	movl	PTR(PA_PTE_0)(%ebp), %edx
+	orl	$PAGE_ATTR, %edx
+	movl	%edx, (%eax)
+
+	movl	PTR(VA_PTE_0)(%ebp), %edi
+	movl	PTR(VA_CONTROL_PAGE)(%ebp), %eax
+	andl	$0x001ff000, %eax
+	shrl	$9, %eax
+	addl	%edi, %eax
+
+	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edx
+	orl	$PAGE_ATTR, %edx
+	movl	%edx, (%eax)
+
+	/* identity map the control page at its physical address */
+
+	movl	PTR(VA_PGD)(%ebp), %edi
+	movl	PTR(PA_CONTROL_PAGE)(%ebp), %eax
+	andl	$0xc0000000, %eax
+	shrl	$27, %eax
+	addl	%edi, %eax
+
+	movl	PTR(PA_PMD_1)(%ebp), %edx
+	orl	$PAE_PGD_ATTR, %edx
+	movl	%edx, (%eax)
+
+	movl	PTR(VA_PMD_1)(%ebp), %edi
+	movl	PTR(PA_CONTROL_PAGE)(%ebp), %eax
+	andl	$0x3fe00000, %eax
+	shrl	$18, %eax
+	addl	%edi, %eax
+
+	movl	PTR(PA_PTE_1)(%ebp), %edx
+	orl	$PAGE_ATTR, %edx
+	movl	%edx, (%eax)
+
+	movl	PTR(VA_PTE_1)(%ebp), %edi
+	movl	PTR(PA_CONTROL_PAGE)(%ebp), %eax
+	andl	$0x001ff000, %eax
+	shrl	$9, %eax
+	addl	%edi, %eax
+
+	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edx
+	orl	$PAGE_ATTR, %edx
+	movl	%edx, (%eax)
+#else
+	/* map the control page at its virtual address */
+
+	movl	PTR(VA_PGD)(%ebp), %edi
+	movl	PTR(VA_CONTROL_PAGE)(%ebp), %eax
+	andl	$0xffc00000, %eax
+	shrl	$20, %eax
+	addl	%edi, %eax
+
+	movl	PTR(PA_PTE_0)(%ebp), %edx
+	orl	$PAGE_ATTR, %edx
+	movl	%edx, (%eax)
+
+	movl	PTR(VA_PTE_0)(%ebp), %edi
+	movl	PTR(VA_CONTROL_PAGE)(%ebp), %eax
+	andl	$0x003ff000, %eax
+	shrl	$10, %eax
+	addl	%edi, %eax
+
+	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edx
+	orl	$PAGE_ATTR, %edx
+	movl	%edx, (%eax)
+
+	/* identity map the control page at its physical address */
+
+	movl	PTR(VA_PGD)(%ebp), %edi
+	movl	PTR(PA_CONTROL_PAGE)(%ebp), %eax
+	andl	$0xffc00000, %eax
+	shrl	$20, %eax
+	addl	%edi, %eax
+
+	movl	PTR(PA_PTE_1)(%ebp), %edx
+	orl	$PAGE_ATTR, %edx
+	movl	%edx, (%eax)
+
+	movl	PTR(VA_PTE_1)(%ebp), %edi
+	movl	PTR(PA_CONTROL_PAGE)(%ebp), %eax
+	andl	$0x003ff000, %eax
+	shrl	$10, %eax
+	addl	%edi, %eax
+
+	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edx
+	orl	$PAGE_ATTR, %edx
+	movl	%edx, (%eax)
+#endif
 
-	/*
-	 * Must be relocatable PIC code callable as a C function, that once
-	 * it starts can not use the previous processes stack.
-	 */
-	.globl relocate_new_kernel
 relocate_new_kernel:
 	/* read the arguments and say goodbye to the stack */
 	movl  4(%esp), %ebx /* page_list */
-	movl  8(%esp), %ebp /* reboot_code_buffer */
+	movl  8(%esp), %ebp /* list of pages */
 	movl  12(%esp), %edx /* start address */
 	movl  16(%esp), %ecx /* cpu_has_pae */
 
@@ -24,11 +146,26 @@ relocate_new_kernel:
 	pushl $0
 	popfl
 
-	/* set a new stack at the bottom of our page... */
-	lea   4096(%ebp), %esp
+	/* get physical address of control page now */
+	/* this is impossible after page table switch */
+	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edi
 
-	/* store the parameters back on the stack */
-	pushl   %edx /* store the start address */
+	/* switch to new set of page tables */
+	movl	PTR(PA_PGD)(%ebp), %eax
+	movl	%eax, %cr3
+
+	/* setup a new stack at the end of the physical control page */
+	lea	4096(%edi), %esp
+
+	/* jump to identity mapped page */
+	movl    %edi, %eax
+	addl    $(identity_mapped - relocate_kernel), %eax
+	pushl   %eax
+	ret
+
+identity_mapped:
+	/* store the start address on the stack */
+	pushl   %edx
 
 	/* Set cr0 to a known state:
 	 * 31 0 == Paging disabled
@@ -113,8 +250,3 @@ relocate_new_kernel:
 	xorl    %edi, %edi
 	xorl    %ebp, %ebp
 	ret
-relocate_new_kernel_end:
-
-	.globl relocate_new_kernel_size
-relocate_new_kernel_size:
-	.long relocate_new_kernel_end - relocate_new_kernel
Index: linux-2.6.18.noarch/include/asm-i386/kexec.h
===================================================================
--- linux-2.6.18.noarch.orig/include/asm-i386/kexec.h
+++ linux-2.6.18.noarch/include/asm-i386/kexec.h
@@ -1,6 +1,26 @@
 #ifndef _I386_KEXEC_H
 #define _I386_KEXEC_H
 
+#define PA_CONTROL_PAGE  0
+#define VA_CONTROL_PAGE  1
+#define PA_PGD           2
+#define VA_PGD           3
+#define PA_PTE_0         4
+#define VA_PTE_0         5
+#define PA_PTE_1         6
+#define VA_PTE_1         7
+#ifdef CONFIG_X86_PAE
+#define PA_PMD_0         8
+#define VA_PMD_0         9
+#define PA_PMD_1         10
+#define VA_PMD_1         11
+#define PAGES_NR         12
+#else
+#define PAGES_NR         8
+#endif
+
+#ifndef __ASSEMBLY__
+
 #include <asm/fixmap.h>
 #include <asm/ptrace.h>
 #include <asm/string.h>
@@ -72,5 +92,12 @@ static inline void crash_setup_regs(stru
                newregs->eip = (unsigned long)current_text_addr();
        }
 }
+asmlinkage NORET_TYPE void
+relocate_kernel(unsigned long indirection_page,
+		unsigned long control_page,
+		unsigned long start_address,
+		unsigned int has_pae) ATTRIB_NORET;
+
+#endif /* __ASSEMBLY__ */
 
 #endif /* _I386_KEXEC_H */


X-Git-Tag: v2.6.19^0~2018^2~55
X-Git-Url: http://localhost/gitweb?p=linux-2.6%2F.git;a=commitdiff_plain;h=4bfaaef01a1badb9e8ffb0c0a37cd2379008d21f

[PATCH] Avoid overwriting the current pgd (V4, x86_64)

kexec: Avoid overwriting the current pgd (V4, x86_64)

This patch upgrades the x86_64-specific kexec code to avoid overwriting the
current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
to start a secondary kernel that dumps the memory of the previous kernel.

The code introduces a new set of page tables. These tables are used to provide
an executable identity mapping without overwriting the current pgd.

Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/machine_kexec.c   |   67 +++++++------
 arch/x86_64/kernel/relocate_kernel.S |  171 +++++++++++++++++++++++++++++++----
 include/asm-x86_64/kexec.h           |   29 +++++
 3 files changed, 216 insertions(+), 51 deletions(-)

Index: linux-2.6.18.noarch/arch/x86_64/kernel/machine_kexec.c
===================================================================
--- linux-2.6.18.noarch.orig/arch/x86_64/kernel/machine_kexec.c
+++ linux-2.6.18.noarch/arch/x86_64/kernel/machine_kexec.c
@@ -15,6 +15,15 @@
 #include <asm/mmu_context.h>
 #include <asm/io.h>
 
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+static u64 kexec_pgd[512] PAGE_ALIGNED;
+static u64 kexec_pud0[512] PAGE_ALIGNED;
+static u64 kexec_pmd0[512] PAGE_ALIGNED;
+static u64 kexec_pte0[512] PAGE_ALIGNED;
+static u64 kexec_pud1[512] PAGE_ALIGNED;
+static u64 kexec_pmd1[512] PAGE_ALIGNED;
+static u64 kexec_pte1[512] PAGE_ALIGNED;
+
 static void init_level2_page(pmd_t *level2p, unsigned long addr)
 {
 	unsigned long end_addr;
@@ -144,32 +153,19 @@ static void load_segments(void)
 		);
 }
 
-typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
-					unsigned long control_code_buffer,
-					unsigned long start_address,
-					unsigned long pgtable) ATTRIB_NORET;
-
-extern const unsigned char relocate_new_kernel[];
-extern const unsigned long relocate_new_kernel_size;
-
 int machine_kexec_prepare(struct kimage *image)
 {
-	unsigned long start_pgtable, control_code_buffer;
+	unsigned long start_pgtable;
 	int result;
 
 	/* Calculate the offsets */
 	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
-	control_code_buffer = start_pgtable + PAGE_SIZE;
 
 	/* Setup the identity mapped 64bit page table */
 	result = init_pgtable(image, start_pgtable);
 	if (result)
 		return result;
 
-	/* Place the code in the reboot code buffer */
-	memcpy(__va(control_code_buffer), relocate_new_kernel,
-						relocate_new_kernel_size);
-
 	return 0;
 }
 
@@ -184,28 +180,34 @@ void machine_kexec_cleanup(struct kimage
  */
 NORET_TYPE void machine_kexec(struct kimage *image)
 {
-	unsigned long page_list;
-	unsigned long control_code_buffer;
-	unsigned long start_pgtable;
-	relocate_new_kernel_t rnk;
+	unsigned long page_list[PAGES_NR];
+	void *control_page;
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
 
-	/* Calculate the offsets */
-	page_list = image->head;
-	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
-	control_code_buffer = start_pgtable + PAGE_SIZE;
+	control_page = page_address(image->control_code_page) + PAGE_SIZE;
+	memcpy(control_page, relocate_kernel, PAGE_SIZE);
 
-	/* Set the low half of the page table to my identity mapped
-	 * page table for kexec.  Leave the high half pointing at the
-	 * kernel pages.   Don't bother to flush the global pages
-	 * as that will happen when I fully switch to my identity mapped
-	 * page table anyway.
-	 */
-	memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
-	__flush_tlb();
+	page_list[PA_CONTROL_PAGE] = __pa(control_page);
+	page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
+	page_list[PA_PGD] = __pa(kexec_pgd);
+	page_list[VA_PGD] = (unsigned long)kexec_pgd;
+	page_list[PA_PUD_0] = __pa(kexec_pud0);
+	page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
+	page_list[PA_PMD_0] = __pa(kexec_pmd0);
+	page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
+	page_list[PA_PTE_0] = __pa(kexec_pte0);
+	page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
+	page_list[PA_PUD_1] = __pa(kexec_pud1);
+	page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
+	page_list[PA_PMD_1] = __pa(kexec_pmd1);
+	page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
+	page_list[PA_PTE_1] = __pa(kexec_pte1);
+	page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
 
+	page_list[PA_TABLE_PAGE] =
+	  (unsigned long)__pa(page_address(image->control_code_page));
 
 	/* The segment registers are funny things, they have both a
 	 * visible and an invisible part.  Whenever the visible part is
@@ -222,7 +224,8 @@ NORET_TYPE void machine_kexec(struct kim
 	 */
 	set_gdt(phys_to_virt(0),0);
 	set_idt(phys_to_virt(0),0);
+
 	/* now call it */
-	rnk = (relocate_new_kernel_t) control_code_buffer;
-	(*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
+	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+			image->start);
 }
Index: linux-2.6.18.noarch/arch/x86_64/kernel/relocate_kernel.S
===================================================================
--- linux-2.6.18.noarch.orig/arch/x86_64/kernel/relocate_kernel.S
+++ linux-2.6.18.noarch/arch/x86_64/kernel/relocate_kernel.S
@@ -7,31 +7,169 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/page.h>
+#include <asm/kexec.h>
 
-	/*
-	 * Must be relocatable PIC code callable as a C function, that once
-	 * it starts can not use the previous processes stack.
-	 */
-	.globl relocate_new_kernel
+/*
+ * Must be relocatable PIC code callable as a C function
+ */
+
+#define PTR(x) (x << 3)
+#define PAGE_ALIGNED (1 << PAGE_SHIFT)
+#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
+
+	.text
+	.align PAGE_ALIGNED
 	.code64
+	.globl relocate_kernel
+relocate_kernel:
+	/* %rdi indirection_page
+	 * %rsi page_list
+	 * %rdx start address
+	 */
+
+	/* map the control page at its virtual address */
+
+	movq	$0x0000ff8000000000, %r10        /* mask */
+	mov	$(39 - 3), %cl                   /* bits to shift */
+	movq	PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
+
+	movq	%r11, %r9
+	andq	%r10, %r9
+	shrq	%cl, %r9
+
+	movq	PTR(VA_PGD)(%rsi), %r8
+	addq	%r8, %r9
+	movq	PTR(PA_PUD_0)(%rsi), %r8
+	orq	$PAGE_ATTR, %r8
+	movq	%r8, (%r9)
+
+	shrq	$9, %r10
+	sub	$9, %cl
+
+	movq	%r11, %r9
+	andq	%r10, %r9
+	shrq	%cl, %r9
+
+	movq	PTR(VA_PUD_0)(%rsi), %r8
+	addq	%r8, %r9
+	movq	PTR(PA_PMD_0)(%rsi), %r8
+	orq	$PAGE_ATTR, %r8
+	movq	%r8, (%r9)
+
+	shrq	$9, %r10
+	sub	$9, %cl
+
+	movq	%r11, %r9
+	andq	%r10, %r9
+	shrq	%cl, %r9
+
+	movq	PTR(VA_PMD_0)(%rsi), %r8
+	addq	%r8, %r9
+	movq	PTR(PA_PTE_0)(%rsi), %r8
+	orq	$PAGE_ATTR, %r8
+	movq	%r8, (%r9)
+
+	shrq	$9, %r10
+	sub	$9, %cl
+
+	movq	%r11, %r9
+	andq	%r10, %r9
+	shrq	%cl, %r9
+
+	movq	PTR(VA_PTE_0)(%rsi), %r8
+	addq	%r8, %r9
+	movq	PTR(PA_CONTROL_PAGE)(%rsi), %r8
+	orq	$PAGE_ATTR, %r8
+	movq	%r8, (%r9)
+
+	/* identity map the control page at its physical address */
+
+	movq	$0x0000ff8000000000, %r10        /* mask */
+	mov	$(39 - 3), %cl                   /* bits to shift */
+	movq	PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
+
+	movq	%r11, %r9
+	andq	%r10, %r9
+	shrq	%cl, %r9
+
+	movq	PTR(VA_PGD)(%rsi), %r8
+	addq	%r8, %r9
+	movq	PTR(PA_PUD_1)(%rsi), %r8
+	orq	$PAGE_ATTR, %r8
+	movq	%r8, (%r9)
+
+	shrq	$9, %r10
+	sub	$9, %cl
+
+	movq	%r11, %r9
+	andq	%r10, %r9
+	shrq	%cl, %r9
+
+	movq	PTR(VA_PUD_1)(%rsi), %r8
+	addq	%r8, %r9
+	movq	PTR(PA_PMD_1)(%rsi), %r8
+	orq	$PAGE_ATTR, %r8
+	movq	%r8, (%r9)
+
+	shrq	$9, %r10
+	sub	$9, %cl
+
+	movq	%r11, %r9
+	andq	%r10, %r9
+	shrq	%cl, %r9
+
+	movq	PTR(VA_PMD_1)(%rsi), %r8
+	addq	%r8, %r9
+	movq	PTR(PA_PTE_1)(%rsi), %r8
+	orq	$PAGE_ATTR, %r8
+	movq	%r8, (%r9)
+
+	shrq	$9, %r10
+	sub	$9, %cl
+
+	movq	%r11, %r9
+	andq	%r10, %r9
+	shrq	%cl, %r9
+
+	movq	PTR(VA_PTE_1)(%rsi), %r8
+	addq	%r8, %r9
+	movq	PTR(PA_CONTROL_PAGE)(%rsi), %r8
+	orq	$PAGE_ATTR, %r8
+	movq	%r8, (%r9)
+
 relocate_new_kernel:
-	/* %rdi page_list
-	 * %rsi reboot_code_buffer
+	/* %rdi indirection_page
+	 * %rsi page_list
 	 * %rdx start address
-	 * %rcx page_table
-	 * %r8  arg5
-	 * %r9  arg6
 	 */
 
 	/* zero out flags, and disable interrupts */
 	pushq $0
 	popfq
 
-	/* set a new stack at the bottom of our page... */
-	lea   4096(%rsi), %rsp
+	/* get physical address of control page now */
+	/* this is impossible after page table switch */
+	movq	PTR(PA_CONTROL_PAGE)(%rsi), %r8
 
-	/* store the parameters back on the stack */
-	pushq	%rdx /* store the start address */
+	/* get physical address of page table now too */
+	movq	PTR(PA_TABLE_PAGE)(%rsi), %rcx
+
+	/* switch to new set of page tables */
+	movq	PTR(PA_PGD)(%rsi), %r9
+	movq	%r9, %cr3
+
+	/* setup a new stack at the end of the physical control page */
+	lea	4096(%r8), %rsp
+
+	/* jump to identity mapped page */
+	addq	$(identity_mapped - relocate_kernel), %r8
+	pushq	%r8
+	ret
+
+identity_mapped:
+	/* store the start address on the stack */
+	pushq   %rdx
 
 	/* Set cr0 to a known state:
 	 * 31 1 == Paging enabled
@@ -136,8 +274,3 @@ relocate_new_kernel:
 	xorq	%r15, %r15
 
 	ret
-relocate_new_kernel_end:
-
-	.globl relocate_new_kernel_size
-relocate_new_kernel_size:
-	.quad relocate_new_kernel_end - relocate_new_kernel
Index: linux-2.6.18.noarch/include/asm-x86_64/kexec.h
===================================================================
--- linux-2.6.18.noarch.orig/include/asm-x86_64/kexec.h
+++ linux-2.6.18.noarch/include/asm-x86_64/kexec.h
@@ -1,6 +1,27 @@
 #ifndef _X86_64_KEXEC_H
 #define _X86_64_KEXEC_H
 
+#define PA_CONTROL_PAGE  0
+#define VA_CONTROL_PAGE  1
+#define PA_PGD           2
+#define VA_PGD           3
+#define PA_PUD_0         4
+#define VA_PUD_0         5
+#define PA_PMD_0         6
+#define VA_PMD_0         7
+#define PA_PTE_0         8
+#define VA_PTE_0         9
+#define PA_PUD_1         10
+#define VA_PUD_1         11
+#define PA_PMD_1         12
+#define VA_PMD_1         13
+#define PA_PTE_1         14
+#define VA_PTE_1         15
+#define PA_TABLE_PAGE    16
+#define PAGES_NR         17
+
+#ifndef __ASSEMBLY__
+
 #include <linux/string.h>
 
 #include <asm/page.h>
@@ -64,4 +85,12 @@ static inline void crash_setup_regs(stru
 		newregs->rip = (unsigned long)current_text_addr();
 	}
 }
+
+NORET_TYPE void
+relocate_kernel(unsigned long indirection_page,
+		unsigned long page_list,
+		unsigned long start_address) ATTRIB_NORET;
+
+#endif /* __ASSEMBLY__ */
+
 #endif /* _X86_64_KEXEC_H */


X-Git-Tag: v2.6.19^0~417^2~1
X-Git-Url: http://localhost/gitweb?p=linux-2.6%2F.git;a=commitdiff_plain;h=dbaab49f92ff6ae6255762a948375e4036cbdbd2

[PATCH] x86-64: Overlapping program headers in physical addr space fix

o A recent change to the vmlinux.lds.S file broke kexec, as the resulting
  vmlinux program headers now overlap in physical address space.

o Now all the vsyscall related sections are placed after data and after
  that mostly init data sections are placed. To avoid physical overlap
  among phdrs, there are three possible solutions.
	- Place vsyscall sections also in data phdrs instead of user
	- move vsyscall sections after init data in bss.
	- create another phdrs say data.init and move all the sections
	  after vsyscall into this new phdr.

o This patch implements the third solution.

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Magnus Damm <magnus@valinux.co.jp>
Cc: Andi Kleen <ak@suse.de>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 arch/x86_64/kernel/vmlinux.lds.S |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

Index: linux-2.6.18.noarch/arch/x86_64/kernel/vmlinux.lds.S
===================================================================
--- linux-2.6.18.noarch.orig/arch/x86_64/kernel/vmlinux.lds.S
+++ linux-2.6.18.noarch/arch/x86_64/kernel/vmlinux.lds.S
@@ -17,6 +17,7 @@ PHDRS {
 	text PT_LOAD FLAGS(5);	/* R_E */
 	data PT_LOAD FLAGS(7);	/* RWE */
 	user PT_LOAD FLAGS(7);	/* RWE */
+	data.init PT_LOAD FLAGS(7);	/* RWE */
 	note PT_NOTE FLAGS(4);	/* R__ */
 }
 SECTIONS
@@ -139,7 +140,7 @@ SECTIONS
   . = ALIGN(8192);		/* init_task */
   .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
 	*(.data.init_task)
-  } :data
+  }:data.init
 
   . = ALIGN(4096);
   .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {


X-Git-Tag: v2.6.19-rc6^0~63
X-Git-Url: http://localhost/gitweb?p=linux-2.6%2F.git;a=commitdiff_plain;h=c06cb8b1c4d25e5b4d7a2d7c2462619de1e0dbc4

[PATCH] i386: Force data segment to be 4K aligned

o Currently there is no specific alignment restriction in the linker script,
  and in some cases the data segment can be placed at a non-4K-aligned address.
  This breaks kexec, which checks that each segment to be loaded is page aligned.

o I guess it does no harm for the data segment to be 4K aligned.

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/vmlinux.lds.S |    1 +
 1 file changed, 1 insertion(+)

Index: linux-2.6.18.noarch/arch/i386/kernel/vmlinux.lds.S
===================================================================
--- linux-2.6.18.noarch.orig/arch/i386/kernel/vmlinux.lds.S
+++ linux-2.6.18.noarch/arch/i386/kernel/vmlinux.lds.S
@@ -64,6 +64,7 @@ SECTIONS
   }
 
   /* writeable */
+  . = ALIGN(4096);
   .data : AT(ADDR(.data) - LOAD_OFFSET) {	/* Data */
 	*(.data)
 	CONSTRUCTORS


X-Git-Tag: v2.6.22-rc1^0~1011^2~149
X-Git-Url: http://localhost/gitweb?p=linux-2.6%2F.git;a=commitdiff_plain;h=79e030114a8d97a1dcd593ab84fb986f8c91c536

[PATCH] i386: Allow i386 crash kernels to handle x86_64 dumps

The specific case I am encountering is kdump under Xen with a 64 bit
hypervisor and 32 bit kernel/userspace.  The dump created is 64 bit due to
the hypervisor but the dump kernel is 32 bit for maximum compatibility.

It's possibly less likely to be useful in a purely native scenario but I
see no reason to disallow it.

[akpm@linux-foundation.org: build fix]

Signed-off-by: Ian Campbell <ian.campbell@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: Vivek Goyal <vgoyal@in.ibm.com>
Cc: Horms <horms@verge.net.au>
Cc: Magnus Damm <magnus.damm@gmail.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/vmcore.c           |    2 +-
 include/asm-i386/kexec.h   |    3 +++
 include/linux/crash_dump.h |    8 ++++++++
 3 files changed, 12 insertions(+), 1 deletion(-)

Index: linux-2.6.18.noarch/fs/proc/vmcore.c
===================================================================
--- linux-2.6.18.noarch.orig/fs/proc/vmcore.c
+++ linux-2.6.18.noarch/fs/proc/vmcore.c
@@ -514,7 +514,7 @@ static int __init parse_crash_elf64_head
 	/* Do some basic Verification. */
 	if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
 		(ehdr.e_type != ET_CORE) ||
-		!elf_check_arch(&ehdr) ||
+		!vmcore_elf_check_arch(&ehdr) ||
 		ehdr.e_ident[EI_CLASS] != ELFCLASS64 ||
 		ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
 		ehdr.e_version != EV_CURRENT ||
Index: linux-2.6.18.noarch/include/asm-i386/kexec.h
===================================================================
--- linux-2.6.18.noarch.orig/include/asm-i386/kexec.h
+++ linux-2.6.18.noarch/include/asm-i386/kexec.h
@@ -47,6 +47,9 @@
 /* The native architecture */
 #define KEXEC_ARCH KEXEC_ARCH_386
 
+/* We can also handle crash dumps from 64 bit kernel. */
+#define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64)
+
 #define MAX_NOTE_BYTES 1024
 
 /* CPU does not save ss and esp on stack if execution is already
Index: linux-2.6.18.noarch/include/linux/crash_dump.h
===================================================================
--- linux-2.6.18.noarch.orig/include/linux/crash_dump.h
+++ linux-2.6.18.noarch/include/linux/crash_dump.h
@@ -14,5 +14,13 @@ extern ssize_t copy_oldmem_page(unsigned
 extern const struct file_operations proc_vmcore_operations;
 extern struct proc_dir_entry *proc_vmcore;
 
+/* Architecture code defines this if there are other possible ELF
+ * machine types, e.g. on bi-arch capable hardware. */
+#ifndef vmcore_elf_check_arch_cross
+#define vmcore_elf_check_arch_cross(x) 0
+#endif
+
+#define vmcore_elf_check_arch(x) (elf_check_arch(x) || vmcore_elf_check_arch_cross(x))
+
 #endif /* CONFIG_CRASH_DUMP */
 #endif /* LINUX_CRASHDUMP_H */


---
 arch/x86_64/kernel/machine_kexec.c |   14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

Index: linux-2.6.18.noarch/arch/x86_64/kernel/machine_kexec.c
===================================================================
--- linux-2.6.18.noarch.orig/arch/x86_64/kernel/machine_kexec.c
+++ linux-2.6.18.noarch/arch/x86_64/kernel/machine_kexec.c
@@ -255,19 +255,19 @@ NORET_TYPE void machine_kexec(struct kim
 
 	page_list[PA_CONTROL_PAGE] = __pa(control_page);
 	page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
-	page_list[PA_PGD] = __pa(kexec_pgd);
+	page_list[PA_PGD] = __pa_symbol(&kexec_pgd);
 	page_list[VA_PGD] = (unsigned long)kexec_pgd;
-	page_list[PA_PUD_0] = __pa(kexec_pud0);
+	page_list[PA_PUD_0] = __pa_symbol(&kexec_pud0);
 	page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
-	page_list[PA_PMD_0] = __pa(kexec_pmd0);
+	page_list[PA_PMD_0] = __pa_symbol(&kexec_pmd0);
 	page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
-	page_list[PA_PTE_0] = __pa(kexec_pte0);
+	page_list[PA_PTE_0] = __pa_symbol(&kexec_pte0);
 	page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
-	page_list[PA_PUD_1] = __pa(kexec_pud1);
+	page_list[PA_PUD_1] = __pa_symbol(&kexec_pud1);
 	page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
-	page_list[PA_PMD_1] = __pa(kexec_pmd1);
+	page_list[PA_PMD_1] = __pa_symbol(&kexec_pmd1);
 	page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
-	page_list[PA_PTE_1] = __pa(kexec_pte1);
+	page_list[PA_PTE_1] = __pa_symbol(&kexec_pte1);
 	page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
 
 	page_list[PA_TABLE_PAGE] =

Second set of patches in preparation of xen kexec/kdump (dom0, i.e.
physical machine) support.  These two went over lkml, but were rejected
with "we'll do that when xen dom0 support goes mainline".  One is for
i386, one for x86_64, descriptions are on top of the actual patches.

Note: not mainline -- kraxel
URL: http://lkml.org/lkml/2006/12/5/99

From	Magnus Damm
Date	Tue, 05 Dec 2006 22:38:02 +0900
Subject	[PATCH 01/02] kexec: Move segment code to assembly file (i386)

kexec: Move segment code to assembly file (i386)

This patch moves the idt, gdt, and segment handling code from
machine_kexec.c to relocate_kernel.S. The main reason behind this move
is to avoid code duplication in the Xen hypervisor. With this patch all
code required to kexec is put on the control page.

Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
---
 arch/i386/kernel/machine_kexec.c   |   59 -------------------------------------
 arch/i386/kernel/relocate_kernel.S |   58 +++++++++++++++++++++++++++++++++---
 2 files changed, 53 insertions(+), 64 deletions(-)

Index: linux-2.6.18.noarch/arch/i386/kernel/machine_kexec.c
===================================================================
--- linux-2.6.18.noarch.orig/arch/i386/kernel/machine_kexec.c
+++ linux-2.6.18.noarch/arch/i386/kernel/machine_kexec.c
@@ -28,48 +28,6 @@ static u32 kexec_pmd1[1024] PAGE_ALIGNED
 static u32 kexec_pte0[1024] PAGE_ALIGNED;
 static u32 kexec_pte1[1024] PAGE_ALIGNED;
 
-static void set_idt(void *newidt, __u16 limit)
-{
-	struct Xgt_desc_struct curidt;
-
-	/* ia32 supports unaliged loads & stores */
-	curidt.size    = limit;
-	curidt.address = (unsigned long)newidt;
-
-	load_idt(&curidt);
-};
-
-
-static void set_gdt(void *newgdt, __u16 limit)
-{
-	struct Xgt_desc_struct curgdt;
-
-	/* ia32 supports unaligned loads & stores */
-	curgdt.size    = limit;
-	curgdt.address = (unsigned long)newgdt;
-
-	load_gdt(&curgdt);
-};
-
-static void load_segments(void)
-{
-#define __STR(X) #X
-#define STR(X) __STR(X)
-
-	__asm__ __volatile__ (
-		"\tljmp $"STR(__KERNEL_CS)",$1f\n"
-		"\t1:\n"
-		"\tmovl $"STR(__KERNEL_DS)",%%eax\n"
-		"\tmovl %%eax,%%ds\n"
-		"\tmovl %%eax,%%es\n"
-		"\tmovl %%eax,%%fs\n"
-		"\tmovl %%eax,%%gs\n"
-		"\tmovl %%eax,%%ss\n"
-		::: "eax", "memory");
-#undef STR
-#undef __STR
-}
-
 /*
  * A architecture hook called to validate the
  * proposed image and prepare the control pages
@@ -126,23 +84,6 @@ NORET_TYPE void machine_kexec(struct kim
 	page_list[PA_PTE_1] = __pa(kexec_pte1);
 	page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
 
-	/* The segment registers are funny things, they have both a
-	 * visible and an invisible part.  Whenever the visible part is
-	 * set to a specific selector, the invisible part is loaded
-	 * with from a table in memory.  At no other time is the
-	 * descriptor table in memory accessed.
-	 *
-	 * I take advantage of this here by force loading the
-	 * segments, before I zap the gdt with an invalid value.
-	 */
-	load_segments();
-	/* The gdt & idt are now invalid.
-	 * If you want to load them you must set up your own idt & gdt.
-	 */
-	set_gdt(phys_to_virt(0),0);
-	set_idt(phys_to_virt(0),0);
-
-	/* now call it */
 	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
 			image->start, cpu_has_pae);
 }
Index: linux-2.6.18.noarch/arch/i386/kernel/relocate_kernel.S
===================================================================
--- linux-2.6.18.noarch.orig/arch/i386/kernel/relocate_kernel.S
+++ linux-2.6.18.noarch/arch/i386/kernel/relocate_kernel.S
@@ -154,14 +154,45 @@ relocate_new_kernel:
 	movl	PTR(PA_PGD)(%ebp), %eax
 	movl	%eax, %cr3
 
+	/* setup idt */
+	movl	%edi, %eax
+	addl	$(idt_48 - relocate_kernel), %eax
+	lidtl	(%eax)
+
+	/* setup gdt */
+	movl	%edi, %eax
+	addl	$(gdt - relocate_kernel), %eax
+	movl	%edi, %esi
+	addl	$((gdt_48 - relocate_kernel) + 2), %esi
+	movl	%eax, (%esi)
+
+	movl	%edi, %eax
+	addl	$(gdt_48 - relocate_kernel), %eax
+	lgdtl	(%eax)
+
+	/* setup data segment registers */
+	mov	$(gdt_ds - gdt), %eax
+	mov	%eax, %ds
+	mov	%eax, %es
+	mov	%eax, %fs
+	mov	%eax, %gs
+	mov	%eax, %ss
+
 	/* setup a new stack at the end of the physical control page */
 	lea	4096(%edi), %esp
 
-	/* jump to identity mapped page */
-	movl    %edi, %eax
-	addl    $(identity_mapped - relocate_kernel), %eax
-	pushl   %eax
-	ret
+	/* load new code segment and jump to identity mapped page */
+	movl	%edi, %esi
+	xorl	%eax, %eax
+	pushl	%eax
+	pushl	%esi
+	pushl	%eax
+	movl	$(gdt_cs - gdt), %eax
+	pushl	%eax
+	movl	%edi, %eax
+	addl	$(identity_mapped - relocate_kernel),%eax
+	pushl	%eax
+	iretl
 
 identity_mapped:
 	/* store the start address on the stack */
@@ -250,3 +281,20 @@ identity_mapped:
 	xorl    %edi, %edi
 	xorl    %ebp, %ebp
 	ret
+
+	.align	16
+gdt:
+	.quad	0x0000000000000000	/* NULL descriptor */
+gdt_cs:
+	.quad	0x00cf9a000000ffff	/* kernel 4GB code at 0x00000000 */
+gdt_ds:
+	.quad	0x00cf92000000ffff	/* kernel 4GB data at 0x00000000 */
+gdt_end:
+
+gdt_48:
+	.word	gdt_end - gdt - 1	/* limit */
+	.long	0			/* base - filled in by code above */
+
+idt_48:
+	.word	0			/* limit */
+	.long	0			/* base */


Note: not mainline -- kraxel
URL: http://lkml.org/lkml/2006/12/5/98

From	Magnus Damm <magnus@valinux.co.jp>
Date	Tue, 05 Dec 2006 22:38:07 +0900
Subject	[PATCH 02/02] kexec: Move segment code to assembly file (x86_64)

kexec: Move segment code to assembly file (x86_64)

This patch moves the idt, gdt, and segment handling code from
machine_kexec.c to relocate_kernel.S. The main reason behind this move
is to avoid code duplication in the Xen hypervisor. With this patch all
code required to kexec is put on the control page.

Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
---
 arch/x86_64/kernel/machine_kexec.c   |   58 -----------------------------------
 arch/x86_64/kernel/relocate_kernel.S |   50 +++++++++++++++++++++++++++---
 2 files changed, 45 insertions(+), 63 deletions(-)

Index: linux-2.6.18.noarch/arch/x86_64/kernel/machine_kexec.c
===================================================================
--- linux-2.6.18.noarch.orig/arch/x86_64/kernel/machine_kexec.c
+++ linux-2.6.18.noarch/arch/x86_64/kernel/machine_kexec.c
@@ -112,47 +112,6 @@ static int init_pgtable(struct kimage *i
  	return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
 }
 
-static void set_idt(void *newidt, u16 limit)
-{
-	struct desc_ptr curidt;
-
-	/* x86-64 supports unaliged loads & stores */
-	curidt.size    = limit;
-	curidt.address = (unsigned long)newidt;
-
-	__asm__ __volatile__ (
-		"lidtq %0\n"
-		: : "m" (curidt)
-		);
-};
-
-
-static void set_gdt(void *newgdt, u16 limit)
-{
-	struct desc_ptr curgdt;
-
-	/* x86-64 supports unaligned loads & stores */
-	curgdt.size    = limit;
-	curgdt.address = (unsigned long)newgdt;
-
-	__asm__ __volatile__ (
-		"lgdtq %0\n"
-		: : "m" (curgdt)
-		);
-};
-
-static void load_segments(void)
-{
-	__asm__ __volatile__ (
-		"\tmovl %0,%%ds\n"
-		"\tmovl %0,%%es\n"
-		"\tmovl %0,%%ss\n"
-		"\tmovl %0,%%fs\n"
-		"\tmovl %0,%%gs\n"
-		: : "a" (__KERNEL_DS) : "memory"
-		);
-}
-
 int machine_kexec_prepare(struct kimage *image)
 {
 	unsigned long start_pgtable;
@@ -209,23 +168,6 @@ NORET_TYPE void machine_kexec(struct kim
 	page_list[PA_TABLE_PAGE] =
 	  (unsigned long)__pa(page_address(image->control_code_page));
 
-	/* The segment registers are funny things, they have both a
-	 * visible and an invisible part.  Whenever the visible part is
-	 * set to a specific selector, the invisible part is loaded
-	 * with from a table in memory.  At no other time is the
-	 * descriptor table in memory accessed.
-	 *
-	 * I take advantage of this here by force loading the
-	 * segments, before I zap the gdt with an invalid value.
-	 */
-	load_segments();
-	/* The gdt & idt are now invalid.
-	 * If you want to load them you must set up your own idt & gdt.
-	 */
-	set_gdt(phys_to_virt(0),0);
-	set_idt(phys_to_virt(0),0);
-
-	/* now call it */
 	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
 			image->start);
 }
Index: linux-2.6.18.noarch/arch/x86_64/kernel/relocate_kernel.S
===================================================================
--- linux-2.6.18.noarch.orig/arch/x86_64/kernel/relocate_kernel.S
+++ linux-2.6.18.noarch/arch/x86_64/kernel/relocate_kernel.S
@@ -159,13 +159,39 @@ relocate_new_kernel:
 	movq	PTR(PA_PGD)(%rsi), %r9
 	movq	%r9, %cr3
 
+	/* setup idt */
+	movq    %r8, %rax
+	addq    $(idt_80 - relocate_kernel), %rax
+	lidtq   (%rax)
+
+	/* setup gdt */
+	movq    %r8, %rax
+	addq    $(gdt - relocate_kernel), %rax
+	movq    %r8, %r9
+	addq    $((gdt_80 - relocate_kernel) + 2), %r9
+	movq    %rax, (%r9)
+
+	movq    %r8, %rax
+	addq    $(gdt_80 - relocate_kernel), %rax
+	lgdtq   (%rax)
+
+	/* setup data segment registers */
+	xorl	%eax, %eax
+	movl    %eax, %ds
+	movl    %eax, %es
+	movl    %eax, %fs
+	movl    %eax, %gs
+	movl    %eax, %ss
+
 	/* setup a new stack at the end of the physical control page */
 	lea	4096(%r8), %rsp
 
-	/* jump to identity mapped page */
-	addq	$(identity_mapped - relocate_kernel), %r8
-	pushq	%r8
-	ret
+	/* load new code segment and jump to identity mapped page */
+	movq	%r8, %rax
+	addq    $(identity_mapped - relocate_kernel), %rax
+	pushq	$(gdt_cs - gdt)
+	pushq	%rax
+	lretq
 
 identity_mapped:
 	/* store the start address on the stack */
@@ -272,5 +298,19 @@ identity_mapped:
 	xorq	%r13, %r13
 	xorq	%r14, %r14
 	xorq	%r15, %r15
-
 	ret
+
+	.align  16
+gdt:
+	.quad	0x0000000000000000	/* NULL descriptor */
+gdt_cs:
+	.quad   0x00af9a000000ffff
+gdt_end:
+
+gdt_80:
+	.word	gdt_end - gdt - 1	/* limit */
+	.quad	0			/* base - filled in by code above */
+
+idt_80:
+	.word	0			/* limit */
+	.quad	0			/* base */


>Now the actual xen kexec/kdump (dom0, i.e. physical machine) 
>implementation, pulled from xen-unstable mercurial and backported to 
>RHEL-5 kernel.

Updated patch (kexec-combined-2), added one missing mercurial changeset
(12766-Remove_unused_KEXEC_RANGE_VA_XEN), updated patch comment accordingly.

This is a repost with updated bugzilla number and testing status.

Tested with all four combinations out of i386/x86_64 and xen/baremetal.

Note for x86_64/baremetal: also needs the kdump fix posted by Neil 
Horman yesterday.

Note for x86_64/xen: needs kexec-tools update.  I've tested with 
git://git.kernel.org/pub/scm/linux/kernel/git/horms/kexec-tools-testing.git.

cheers,
  Gerd


combined patch, xen dom0 kexec implementation, linux kernel bits

I've bundled all up into one big patch, including the fixes added after
the initial merge into the xen-unstable tree.  I think it is much easier
to review that way, especially as the initial merge was quite messy and
was cleaned up later on.

xen-unstable changesets, initial kexec merge

12623-XEN_Kexec__Kdump_Generic_code
12624-XEN_Kexec__Kdump_Code_shared_between_x86_32_and_x86_64
12625-XEN_Kexec__Kdump_x86_32_specific_code
12626-XEN_Kexec__Kdump_x86_64_specific_code

xen-unstable changesets, kexec bugfixes

12654-LINUX_Only_initialise_kexec_in_domain0.
12658-LINUX_Fix_backward_compatibility_with_hypervisors_which_do_not_support_kexec.
12733-LINUX_The_crash_note_resource_should_be_nested_inside_the_hypervisor
12766-Remove_unused_KEXEC_RANGE_VA_XEN.
12804-LINUX_Kexec_Do_not_bug_if_a_kexec_hypercall_fails.
12807-LINUX_Kexec_Reapply_kexec_patches_to_sparse_tree.
12808-LINUX_Kexec_Remove_remainder_of_kexec-generic.patch
13309-LINUX_Kexec_Dont_try_to_load_an_NULL_image._This_can_occur_when_unloading.
13317-LINUX_Clarify_comment_in_xen_machine_kexec_setup_resources.
14372-LINUX_Kexec_Ensure_that_pages_allocated_for_kexec_have_MFNs_which_are_within

---
 arch/i386/Kconfig                           |    2 
 arch/i386/kernel/crash.c                    |    4 
 arch/i386/kernel/machine_kexec.c            |   40 +++++
 arch/i386/kernel/setup-xen.c                |   19 ++
 arch/x86_64/Kconfig                         |    2 
 arch/x86_64/kernel/crash.c                  |    6 
 arch/x86_64/kernel/e820-xen.c               |    6 
 arch/x86_64/kernel/machine_kexec.c          |  118 ++++++++++++++++-
 arch/x86_64/kernel/setup-xen.c              |   13 +
 drivers/xen/core/Makefile                   |    1 
 drivers/xen/core/machine_kexec.c            |  190 ++++++++++++++++++++++++++++
 include/asm-i386/kexec.h                    |   14 ++
 include/asm-i386/mach-xen/asm/hypercall.h   |    8 +
 include/asm-x86_64/kexec.h                  |   13 +
 include/asm-x86_64/mach-xen/asm/hypercall.h |    7 +
 include/asm-x86_64/mach-xen/asm/ptrace.h    |    2 
 include/linux/kexec.h                       |   13 +
 include/xen/interface/elfnote.h             |   19 ++
 include/xen/interface/kexec.h               |  137 ++++++++++++++++++++
 kernel/kexec.c                              |   73 ++++++++--
 20 files changed, 661 insertions(+), 26 deletions(-)

Index: linux-2.6.18.noarch/drivers/xen/core/Makefile
===================================================================
--- linux-2.6.18.noarch.orig/drivers/xen/core/Makefile
+++ linux-2.6.18.noarch/drivers/xen/core/Makefile
@@ -11,3 +11,4 @@ obj-$(CONFIG_XEN_SYSFS)		+= xen_sysfs.o
 obj-$(CONFIG_XEN_SKBUFF)	+= skbuff.o
 obj-$(CONFIG_XEN_REBOOT)	+= reboot.o
 obj-$(CONFIG_XEN_SMPBOOT)	+= smpboot.o
+obj-$(CONFIG_KEXEC)		+= machine_kexec.o
Index: linux-2.6.18.noarch/drivers/xen/core/machine_kexec.c
===================================================================
--- /dev/null
+++ linux-2.6.18.noarch/drivers/xen/core/machine_kexec.c
@@ -0,0 +1,190 @@
+/*
+ * drivers/xen/core/machine_kexec.c
+ * handle transition of Linux booting another kernel
+ */
+
+#include <linux/kexec.h>
+#include <xen/interface/kexec.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <asm/hypercall.h>
+
+extern void machine_kexec_setup_load_arg(xen_kexec_image_t *xki,
+					 struct kimage *image);
+
+int xen_max_nr_phys_cpus;
+struct resource xen_hypervisor_res;
+struct resource *xen_phys_cpus;
+
+void xen_machine_kexec_setup_resources(void)
+{
+	xen_kexec_range_t range;
+	struct resource *res;
+	int k = 0;
+
+	if (!is_initial_xendomain())
+		return;
+
+	/* determine maximum number of physical cpus */
+
+	while (1) {
+		memset(&range, 0, sizeof(range));
+		range.range = KEXEC_RANGE_MA_CPU;
+		range.nr = k;
+
+		if(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
+			break;
+
+		k++;
+	}
+
+	if (k == 0)
+		return;
+
+	xen_max_nr_phys_cpus = k;
+
+	/* allocate xen_phys_cpus */
+
+	xen_phys_cpus = alloc_bootmem_low(k * sizeof(struct resource));
+	BUG_ON(xen_phys_cpus == NULL);
+
+	/* fill in xen_phys_cpus with per-cpu crash note information */
+
+	for (k = 0; k < xen_max_nr_phys_cpus; k++) {
+		memset(&range, 0, sizeof(range));
+		range.range = KEXEC_RANGE_MA_CPU;
+		range.nr = k;
+
+		if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
+			goto err;
+
+		res = xen_phys_cpus + k;
+
+		memset(res, 0, sizeof(*res));
+		res->name = "Crash note";
+		res->start = range.start;
+		res->end = range.start + range.size - 1;
+		res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+	}
+
+	/* fill in xen_hypervisor_res with hypervisor machine address range */
+
+	memset(&range, 0, sizeof(range));
+	range.range = KEXEC_RANGE_MA_XEN;
+
+	if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
+		goto err;
+
+	xen_hypervisor_res.name = "Hypervisor code and data";
+	xen_hypervisor_res.start = range.start;
+	xen_hypervisor_res.end = range.start + range.size - 1;
+	xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+
+	/* fill in crashk_res if range is reserved by hypervisor */
+
+	memset(&range, 0, sizeof(range));
+	range.range = KEXEC_RANGE_MA_CRASH;
+
+	if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
+		return;
+
+	if (range.size) {
+		crashk_res.start = range.start;
+		crashk_res.end = range.start + range.size - 1;
+	}
+
+	return;
+
+ err:
+	/*
+	 * It isn't possible to free xen_phys_cpus this early in the
+	 * boot. Failure at this stage is unexpected and the amount of
+	 * memory is small therefore we tolerate the potential leak.
+         */
+	xen_max_nr_phys_cpus = 0;
+	return;
+}
+
+void xen_machine_kexec_register_resources(struct resource *res)
+{
+	int k;
+
+	request_resource(res, &xen_hypervisor_res);
+
+	for (k = 0; k < xen_max_nr_phys_cpus; k++)
+		request_resource(&xen_hypervisor_res, xen_phys_cpus + k);
+
+}
+
+static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
+{
+	machine_kexec_setup_load_arg(xki, image);
+
+	xki->indirection_page = image->head;
+	xki->start_address = image->start;
+}
+
+/*
+ * Load the image into xen so xen can kdump itself
+ * This might have been done in prepare, but prepare
+ * is currently called too early. It might make sense
+ * to move prepare, but for now, just add an extra hook.
+ */
+int xen_machine_kexec_load(struct kimage *image)
+{
+	xen_kexec_load_t xkl;
+
+	memset(&xkl, 0, sizeof(xkl));
+	xkl.type = image->type;
+	setup_load_arg(&xkl.image, image);
+	return HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load, &xkl);
+}
+
+/*
+ * Unload the image that was stored by xen_machine_kexec_load().
+ * This might have been done in machine_kexec_cleanup(), but it
+ * is called too late, and it's possible xen could try and kdump
+ * using resources that have been freed.
+ */
+void xen_machine_kexec_unload(struct kimage *image)
+{
+	xen_kexec_load_t xkl;
+
+	memset(&xkl, 0, sizeof(xkl));
+	xkl.type = image->type;
+	HYPERVISOR_kexec_op(KEXEC_CMD_kexec_unload, &xkl);
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ *
+ * This has the hypervisor move to the preferred reboot CPU,
+ * stop all CPUs and kexec. That is, it combines machine_shutdown()
+ * and machine_kexec() in Linux kexec terms.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+	xen_kexec_exec_t xke;
+
+	memset(&xke, 0, sizeof(xke));
+	xke.type = image->type;
+	HYPERVISOR_kexec_op(KEXEC_CMD_kexec, &xke);
+	panic("KEXEC_CMD_kexec hypercall should not return\n");
+}
+
+void machine_shutdown(void)
+{
+	/* do nothing */
+}
+
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
Index: linux-2.6.18.noarch/include/xen/interface/elfnote.h
===================================================================
--- linux-2.6.18.noarch.orig/include/xen/interface/elfnote.h
+++ linux-2.6.18.noarch/include/xen/interface/elfnote.h
@@ -120,6 +120,25 @@
  */
 #define XEN_ELFNOTE_BSD_SYMTAB    11
 
+/*
+ * System information exported through crash notes.
+ *
+ * The kexec / kdump code will create one XEN_ELFNOTE_CRASH_INFO
+ * note in case of a system crash. This note will contain various
+ * information about the system, see xen/include/xen/elfcore.h.
+ */
+#define XEN_ELFNOTE_CRASH_INFO 0x1000001
+
+/*
+ * System registers exported through crash notes.
+ *
+ * The kexec / kdump code will create one XEN_ELFNOTE_CRASH_REGS
+ * note per cpu in case of a system crash. This note is architecture
+ * specific and will contain registers not saved in the "CORE" note.
+ * See xen/include/xen/elfcore.h for more information.
+ */
+#define XEN_ELFNOTE_CRASH_REGS 0x1000002
+
 #endif /* __XEN_PUBLIC_ELFNOTE_H__ */
 
 /*
Index: linux-2.6.18.noarch/include/xen/interface/kexec.h
===================================================================
--- /dev/null
+++ linux-2.6.18.noarch/include/xen/interface/kexec.h
@@ -0,0 +1,137 @@
+/******************************************************************************
+ * kexec.h - Public portion
+ *
+ * Xen port written by:
+ * - Simon 'Horms' Horman <horms@verge.net.au>
+ * - Magnus Damm <magnus@valinux.co.jp>
+ */
+
+#ifndef _XEN_PUBLIC_KEXEC_H
+#define _XEN_PUBLIC_KEXEC_H
+
+
+/* This file describes the Kexec / Kdump hypercall interface for Xen.
+ *
+ * Kexec under vanilla Linux allows a user to reboot the physical machine
+ * into a new user-specified kernel. The Xen port extends this idea
+ * to allow rebooting of the machine from dom0. When kexec for dom0
+ * is used to reboot,  both the hypervisor and the domains get replaced
+ * with some other kernel. It is possible to kexec between vanilla
+ * Linux and Xen and back again. Xen to Xen works well too.
+ *
+ * The hypercall interface for kexec can be divided into three main
+ * types of hypercall operations:
+ *
+ * 1) Range information:
+ *    This is used by the dom0 kernel to ask the hypervisor about various
+ *    address information. This information is needed to allow kexec-tools
+ *    to fill in the ELF headers for /proc/vmcore properly.
+ *
+ * 2) Load and unload of images:
+ *    There are no big surprises here, the kexec binary from kexec-tools
+ *    runs in userspace in dom0. The tool loads/unloads data into the
+ *    dom0 kernel such as new kernel, initramfs and hypervisor. When
+ *    loaded the dom0 kernel performs a load hypercall operation, and
+ *    before releasing all page references the dom0 kernel calls unload.
+ *
+ * 3) Kexec operation:
+ *    This is used to start a previously loaded kernel.
+ */
+
+#include "xen.h"
+
+#if defined(__i386__) || defined(__x86_64__)
+#define KEXEC_XEN_NO_PAGES 17
+#endif
+
+/*
+ * Prototype for this hypercall is:
+ *  int kexec_op(int cmd, void *args)
+ * @cmd  == KEXEC_CMD_...
+ *          KEXEC operation to perform
+ * @args == Operation-specific extra arguments (NULL if none).
+ */
+
+/*
+ * Kexec supports two types of operation:
+ * - kexec into a regular kernel, very similar to a standard reboot
+ *   - KEXEC_TYPE_DEFAULT is used to specify this type
+ * - kexec into a special "crash kernel", aka kexec-on-panic
+ *   - KEXEC_TYPE_CRASH is used to specify this type
+ *   - parts of our system may be broken at kexec-on-panic time
+ *     - the code should be kept as simple and self-contained as possible
+ */
+
+#define KEXEC_TYPE_DEFAULT 0
+#define KEXEC_TYPE_CRASH   1
+
+
+/* The kexec implementation for Xen allows the user to load two
+ * types of kernels, KEXEC_TYPE_DEFAULT and KEXEC_TYPE_CRASH.
+ * All data needed for a kexec reboot is kept in one xen_kexec_image_t
+ * per "instance". The data mainly consists of machine address lists to pages
+ * together with destination addresses. The data in xen_kexec_image_t
+ * is passed to the "code page" which is one page of code that performs
+ * the final relocations before jumping to the new kernel.
+ */
+
+typedef struct xen_kexec_image {
+#if defined(__i386__) || defined(__x86_64__)
+    unsigned long page_list[KEXEC_XEN_NO_PAGES];
+#endif
+    unsigned long indirection_page;
+    unsigned long start_address;
+} xen_kexec_image_t;
+
+/*
+ * Perform kexec having previously loaded a kexec or kdump kernel
+ * as appropriate.
+ * type == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in]
+ */
+#define KEXEC_CMD_kexec                 0
+typedef struct xen_kexec_exec {
+    int type;
+} xen_kexec_exec_t;
+
+/*
+ * Load/Unload kernel image for kexec or kdump.
+ * type  == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in]
+ * image == relocation information for kexec (ignored for unload) [in]
+ */
+#define KEXEC_CMD_kexec_load            1
+#define KEXEC_CMD_kexec_unload          2
+typedef struct xen_kexec_load {
+    int type;
+    xen_kexec_image_t image;
+} xen_kexec_load_t;
+
+#define KEXEC_RANGE_MA_CRASH 0   /* machine address and size of crash area */
+#define KEXEC_RANGE_MA_XEN   1   /* machine address and size of Xen itself */
+#define KEXEC_RANGE_MA_CPU   2   /* machine address and size of a CPU note */
+
+/*
+ * Find the address and size of certain memory areas
+ * range == KEXEC_RANGE_... [in]
+ * nr    == physical CPU number (starting from 0) if KEXEC_RANGE_MA_CPU [in]
+ * size  == number of bytes reserved in window [out]
+ * start == address of the first byte in the window [out]
+ */
+#define KEXEC_CMD_kexec_get_range       3
+typedef struct xen_kexec_range {
+    int range;
+    int nr;
+    unsigned long size;
+    unsigned long start;
+} xen_kexec_range_t;
+
+#endif /* _XEN_PUBLIC_KEXEC_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
Index: linux-2.6.18.noarch/arch/i386/Kconfig
===================================================================
--- linux-2.6.18.noarch.orig/arch/i386/Kconfig
+++ linux-2.6.18.noarch/arch/i386/Kconfig
@@ -777,7 +777,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_XEN
+	depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
Index: linux-2.6.18.noarch/arch/i386/kernel/setup-xen.c
===================================================================
--- linux-2.6.18.noarch.orig/arch/i386/kernel/setup-xen.c
+++ linux-2.6.18.noarch/arch/i386/kernel/setup-xen.c
@@ -67,6 +67,10 @@
 #include "setup_arch.h"
 #include <bios_ebda.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 /* Forward Declaration. */
 void __init find_max_pfn(void);
 
@@ -934,6 +938,7 @@ static void __init parse_cmdline_early (
 		 * after a kernel panic.
 		 */
 		else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
 			unsigned long size, base;
 			size = memparse(from+12, &from);
 			if (*from == '@') {
@@ -941,6 +946,10 @@ static void __init parse_cmdline_early (
 				crashk_res.start = base;
 				crashk_res.end   = base + size - 1;
 			}
+#else
+			printk("Ignoring crashkernel command line, "
+			       "parameter will be supplied by xen\n");
+#endif
 		}
 #endif
 #ifdef CONFIG_PROC_VMCORE
@@ -1342,6 +1351,9 @@ void __init setup_bootmem_allocator(void
 	}
 #endif
 #ifdef CONFIG_KEXEC
+#ifdef CONFIG_XEN
+	xen_machine_kexec_setup_resources();
+#else
 	if ((crashk_res.start < crashk_res.end) &&
 	    (crashk_res.end <= (max_low_pfn << PAGE_SHIFT))) {
 		reserve_bootmem(crashk_res.start,
@@ -1355,6 +1367,7 @@ void __init setup_bootmem_allocator(void
 		crashk_res.start = 0;
 	}
 #endif
+#endif
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
 		phys_to_machine_mapping =
@@ -1435,7 +1448,11 @@ legacy_init_iomem_resources(struct resou
 			request_resource(res, data_resource);
 #endif
 #ifdef CONFIG_KEXEC
-			request_resource(res, &crashk_res);
+			if (crashk_res.start != crashk_res.end)
+			     request_resource(res, &crashk_res);
+#ifdef CONFIG_XEN
+			xen_machine_kexec_register_resources(res);
+#endif
 #endif
 		}
 	}
Index: linux-2.6.18.noarch/include/asm-i386/mach-xen/asm/hypercall.h
===================================================================
--- linux-2.6.18.noarch.orig/include/asm-i386/mach-xen/asm/hypercall.h
+++ linux-2.6.18.noarch/include/asm-i386/mach-xen/asm/hypercall.h
@@ -385,5 +385,13 @@ HYPERVISOR_xenoprof_op(
 	return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec_op(
+	unsigned long op, void *args)
+{
+	return _hypercall2(int, kexec_op, op, args);
+}
+
+
 
 #endif /* __HYPERCALL_H__ */
Index: linux-2.6.18.noarch/arch/x86_64/Kconfig
===================================================================
--- linux-2.6.18.noarch.orig/arch/x86_64/Kconfig
+++ linux-2.6.18.noarch/arch/x86_64/Kconfig
@@ -531,7 +531,7 @@ config X86_MCE_AMD
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_64_XEN
+	depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
Index: linux-2.6.18.noarch/arch/x86_64/kernel/e820-xen.c
===================================================================
--- linux-2.6.18.noarch.orig/arch/x86_64/kernel/e820-xen.c
+++ linux-2.6.18.noarch/arch/x86_64/kernel/e820-xen.c
@@ -301,7 +301,11 @@ void __init e820_reserve_resources(struc
 			request_resource(res, &data_resource);
 #endif
 #ifdef CONFIG_KEXEC
-			request_resource(res, &crashk_res);
+			if (crashk_res.start != crashk_res.end)
+				request_resource(res, &crashk_res);
+#ifdef CONFIG_XEN
+			xen_machine_kexec_register_resources(res);
+#endif
 #endif
 		}
 	}
Index: linux-2.6.18.noarch/arch/x86_64/kernel/setup-xen.c
===================================================================
--- linux-2.6.18.noarch.orig/arch/x86_64/kernel/setup-xen.c
+++ linux-2.6.18.noarch/arch/x86_64/kernel/setup-xen.c
@@ -76,6 +76,10 @@
 #include <asm/mach-xen/setup_arch_post.h>
 #include <xen/interface/memory.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 extern unsigned long start_pfn;
 extern struct edid_info edid_info;
 EXPORT_SYMBOL_GPL(edid_info);
@@ -464,6 +468,7 @@ static __init void parse_cmdline_early (
 		 * after a kernel panic.
 		 */
 		else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
 			unsigned long size, base;
 			size = memparse(from+12, &from);
 			if (*from == '@') {
@@ -471,6 +476,10 @@ static __init void parse_cmdline_early (
 				crashk_res.start = base;
 				crashk_res.end   = base + size - 1;
 			}
+#else
+			printk("Ignoring crashkernel command line, "
+			       "parameter will be supplied by xen\n");
+#endif
 		}
 #endif
 
@@ -758,6 +767,9 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #endif	/* !CONFIG_XEN */
 #ifdef CONFIG_KEXEC
+#ifdef CONFIG_XEN
+	xen_machine_kexec_setup_resources();
+#else
 	if ((crashk_res.start < crashk_res.end) &&
 	    (crashk_res.end <= (max_low_pfn << PAGE_SHIFT))) {
 		reserve_bootmem_generic(crashk_res.start,
@@ -771,6 +783,7 @@ void __init setup_arch(char **cmdline_p)
 		crashk_res.start = 0;
 	}
 #endif
+#endif
 
 	paging_init();
 #ifdef CONFIG_X86_LOCAL_APIC
Index: linux-2.6.18.noarch/include/asm-x86_64/mach-xen/asm/hypercall.h
===================================================================
--- linux-2.6.18.noarch.orig/include/asm-x86_64/mach-xen/asm/hypercall.h
+++ linux-2.6.18.noarch/include/asm-x86_64/mach-xen/asm/hypercall.h
@@ -386,4 +386,11 @@ HYPERVISOR_xenoprof_op(
 	return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec_op(
+	unsigned long op, void *args)
+{
+	return _hypercall2(int, kexec_op, op, args);
+}
+
 #endif /* __HYPERCALL_H__ */
Index: linux-2.6.18.noarch/include/asm-x86_64/mach-xen/asm/ptrace.h
===================================================================
--- linux-2.6.18.noarch.orig/include/asm-x86_64/mach-xen/asm/ptrace.h
+++ linux-2.6.18.noarch/include/asm-x86_64/mach-xen/asm/ptrace.h
@@ -90,6 +90,8 @@ extern unsigned long profile_pc(struct p
 #define profile_pc(regs) instruction_pointer(regs)
 #endif
 
+#include <linux/compiler.h>
+
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
 
 struct task_struct;
Index: linux-2.6.18.noarch/arch/i386/kernel/crash.c
===================================================================
--- linux-2.6.18.noarch.orig/arch/i386/kernel/crash.c
+++ linux-2.6.18.noarch/arch/i386/kernel/crash.c
@@ -90,6 +90,7 @@ static void crash_save_self(struct pt_re
 	crash_save_this_cpu(regs, cpu);
 }
 
+#ifndef CONFIG_XEN
 #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
 static atomic_t waiting_for_crash_ipi;
 
@@ -154,6 +155,7 @@ static void nmi_shootdown_cpus(void)
 	/* There are no cpus to shootdown */
 }
 #endif
+#endif /* CONFIG_XEN */
 
 void machine_crash_shutdown(struct pt_regs *regs)
 {
@@ -170,10 +172,12 @@ void machine_crash_shutdown(struct pt_re
 
 	/* Make a note of crashing cpu. Will be used in NMI callback.*/
 	crashing_cpu = smp_processor_id();
+#ifndef CONFIG_XEN
 	nmi_shootdown_cpus();
 	lapic_shutdown();
 #if defined(CONFIG_X86_IO_APIC)
 	disable_IO_APIC();
 #endif
+#endif /* CONFIG_XEN */
 	crash_save_self(regs);
 }
Index: linux-2.6.18.noarch/arch/i386/kernel/machine_kexec.c
===================================================================
--- linux-2.6.18.noarch.orig/arch/i386/kernel/machine_kexec.c
+++ linux-2.6.18.noarch/arch/i386/kernel/machine_kexec.c
@@ -19,6 +19,10 @@
 #include <asm/desc.h>
 #include <asm/system.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
 static u32 kexec_pgd[1024] PAGE_ALIGNED;
 #ifdef CONFIG_X86_PAE
@@ -28,6 +32,40 @@ static u32 kexec_pmd1[1024] PAGE_ALIGNED
 static u32 kexec_pte0[1024] PAGE_ALIGNED;
 static u32 kexec_pte1[1024] PAGE_ALIGNED;
 
+#ifdef CONFIG_XEN
+
+#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
+
+#if PAGES_NR > KEXEC_XEN_NO_PAGES
+#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
+#endif
+
+#if PA_CONTROL_PAGE != 0
+#error PA_CONTROL_PAGE is non zero - Xen support will break
+#endif
+
+void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
+{
+	void *control_page;
+
+	memset(xki->page_list, 0, sizeof(xki->page_list));
+
+	control_page = page_address(image->control_code_page);
+	memcpy(control_page, relocate_kernel, PAGE_SIZE);
+
+	xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
+	xki->page_list[PA_PGD] = __ma(kexec_pgd);
+#ifdef CONFIG_X86_PAE
+	xki->page_list[PA_PMD_0] = __ma(kexec_pmd0);
+	xki->page_list[PA_PMD_1] = __ma(kexec_pmd1);
+#endif
+	xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
+	xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
+
+}
+
+#endif /* CONFIG_XEN */
+
 /*
  * A architecture hook called to validate the
  * proposed image and prepare the control pages
@@ -54,6 +92,7 @@ void machine_kexec_cleanup(struct kimage
 {
 }
 
+#ifndef CONFIG_XEN
 /*
  * Do not allocate memory (or fail in any way) in machine_kexec().
  * We are past the point of no return, committed to rebooting now.
@@ -87,3 +126,4 @@ NORET_TYPE void machine_kexec(struct kim
 	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
 			image->start, cpu_has_pae);
 }
+#endif
Index: linux-2.6.18.noarch/arch/x86_64/kernel/crash.c
===================================================================
--- linux-2.6.18.noarch.orig/arch/x86_64/kernel/crash.c
+++ linux-2.6.18.noarch/arch/x86_64/kernel/crash.c
@@ -92,6 +92,7 @@ static void crash_save_self(struct pt_re
 	crash_save_this_cpu(regs, cpu);
 }
 
+#ifndef CONFIG_XEN
 #ifdef CONFIG_SMP
 static atomic_t waiting_for_crash_ipi;
 
@@ -156,6 +157,7 @@ static void nmi_shootdown_cpus(void)
 	/* There are no cpus to shootdown */
 }
 #endif
+#endif /* CONFIG_XEN */
 
 void machine_crash_shutdown(struct pt_regs *regs)
 {
@@ -173,6 +175,8 @@ void machine_crash_shutdown(struct pt_re
 
 	/* Make a note of crashing cpu. Will be used in NMI callback.*/
 	crashing_cpu = smp_processor_id();
+
+#ifndef CONFIG_XEN
 	nmi_shootdown_cpus();
 
 	if(cpu_has_apic)
@@ -181,6 +185,6 @@ void machine_crash_shutdown(struct pt_re
 #if defined(CONFIG_X86_IO_APIC)
 	disable_IO_APIC();
 #endif
-
+#endif /* CONFIG_XEN */
 	crash_save_self(regs);
 }
Index: linux-2.6.18.noarch/arch/x86_64/kernel/machine_kexec.c
===================================================================
--- linux-2.6.18.noarch.orig/arch/x86_64/kernel/machine_kexec.c
+++ linux-2.6.18.noarch/arch/x86_64/kernel/machine_kexec.c
@@ -24,6 +24,104 @@ static u64 kexec_pud1[512] PAGE_ALIGNED;
 static u64 kexec_pmd1[512] PAGE_ALIGNED;
 static u64 kexec_pte1[512] PAGE_ALIGNED;
 
+#ifdef CONFIG_XEN
+
+/* In the case of Xen, override hypervisor functions to be able to create
+ * a regular identity mapping page table...
+ */
+
+#include <xen/interface/kexec.h>
+#include <xen/interface/memory.h>
+
+#define x__pmd(x) ((pmd_t) { (x) } )
+#define x__pud(x) ((pud_t) { (x) } )
+#define x__pgd(x) ((pgd_t) { (x) } )
+
+#define x_pmd_val(x)   ((x).pmd)
+#define x_pud_val(x)   ((x).pud)
+#define x_pgd_val(x)   ((x).pgd)
+/* Store a pmd entry exactly as given; no phys_to_machine() is applied. */
+static inline void x_set_pmd(pmd_t *dst, pmd_t val)
+{
+	x_pmd_val(*dst) = x_pmd_val(val);
+}
+/* Store a pud entry after passing its value through phys_to_machine(). */
+static inline void x_set_pud(pud_t *dst, pud_t val)
+{
+	x_pud_val(*dst) = phys_to_machine(x_pud_val(val));
+}
+/* Zero a pud entry with a plain store. */
+static inline void x_pud_clear (pud_t *pud)
+{
+	x_pud_val(*pud) = 0;
+}
+/* Store a pgd entry after passing its value through phys_to_machine(). */
+static inline void x_set_pgd(pgd_t *dst, pgd_t val)
+{
+	x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val));
+}
+/* Zero a pgd entry with a plain store. */
+static inline void x_pgd_clear (pgd_t * pgd)
+{
+	x_pgd_val(*pgd) = 0;
+}
+/* Page-table flag sets for the Xen build; parenthesized to compose safely. */
+#define X__PAGE_KERNEL_LARGE_EXEC \
+         (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE)
+#define X_KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+
+#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
+
+#if PAGES_NR > KEXEC_XEN_NO_PAGES
+#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
+#endif
+
+#if PA_CONTROL_PAGE != 0
+#error PA_CONTROL_PAGE is non zero - Xen support will break
+#endif
+/* Xen: build the page_list argument describing this image's control pages. */
+void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
+{
+	void *control_page;
+	void *table_page;
+
+	memset(xki->page_list, 0, sizeof(xki->page_list));
+	/* relocate_kernel is copied one page past the start of the control area. */
+	control_page = page_address(image->control_code_page) + PAGE_SIZE;
+	memcpy(control_page, relocate_kernel, PAGE_SIZE);
+	/* The first page of the control area is handed over as the table page. */
+	table_page = page_address(image->control_code_page);
+
+	xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
+	xki->page_list[PA_TABLE_PAGE] = __ma(table_page);
+	/* Machine addresses (__ma) of the static kexec page-table arrays. */
+	xki->page_list[PA_PGD] = __ma(kexec_pgd);
+	xki->page_list[PA_PUD_0] = __ma(kexec_pud0);
+	xki->page_list[PA_PUD_1] = __ma(kexec_pud1);
+	xki->page_list[PA_PMD_0] = __ma(kexec_pmd0);
+	xki->page_list[PA_PMD_1] = __ma(kexec_pmd1);
+	xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
+	xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
+}
+
+#else /* CONFIG_XEN */
+
+#define x__pmd(x) __pmd(x)
+#define x__pud(x) __pud(x)
+#define x__pgd(x) __pgd(x)
+
+#define x_set_pmd(x, y) set_pmd(x, y)
+#define x_set_pud(x, y) set_pud(x, y)
+#define x_set_pgd(x, y) set_pgd(x, y)
+
+#define x_pud_clear(x) pud_clear(x)
+#define x_pgd_clear(x) pgd_clear(x)
+
+#define X__PAGE_KERNEL_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
+#define X_KERNPG_TABLE _KERNPG_TABLE
+
+#endif /* CONFIG_XEN */
+
 static void init_level2_page(pmd_t *level2p, unsigned long addr)
 {
 	unsigned long end_addr;
@@ -31,7 +129,7 @@ static void init_level2_page(pmd_t *leve
 	addr &= PAGE_MASK;
 	end_addr = addr + PUD_SIZE;
 	while (addr < end_addr) {
-		set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+		x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC));
 		addr += PMD_SIZE;
 	}
 }
@@ -56,12 +154,12 @@ static int init_level3_page(struct kimag
 		}
 		level2p = (pmd_t *)page_address(page);
 		init_level2_page(level2p, addr);
-		set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
+		x_set_pud(level3p++, x__pud(__pa(level2p) | X_KERNPG_TABLE));
 		addr += PUD_SIZE;
 	}
 	/* clear the unused entries */
 	while (addr < end_addr) {
-		pud_clear(level3p++);
+		x_pud_clear(level3p++);
 		addr += PUD_SIZE;
 	}
 out:
@@ -92,12 +190,12 @@ static int init_level4_page(struct kimag
 		if (result) {
 			goto out;
 		}
-		set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
+		x_set_pgd(level4p++, x__pgd(__pa(level3p) | X_KERNPG_TABLE));
 		addr += PGDIR_SIZE;
 	}
 	/* clear the unused entries */
 	while (addr < end_addr) {
-		pgd_clear(level4p++);
+		x_pgd_clear(level4p++);
 		addr += PGDIR_SIZE;
 	}
 out:
@@ -108,8 +206,14 @@ out:
 static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
 {
 	pgd_t *level4p;
+	unsigned long x_end_pfn = end_pfn;
+	/* Xen: limit comes from XENMEM_maximum_ram_page (result unchecked). */
+#ifdef CONFIG_XEN
+	x_end_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
+#endif
+
 	level4p = (pgd_t *)__va(start_pgtable);
- 	return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+ 	return init_level4_page(image, level4p, 0, x_end_pfn << PAGE_SHIFT);
 }
 
 int machine_kexec_prepare(struct kimage *image)
@@ -133,6 +237,7 @@ void machine_kexec_cleanup(struct kimage
 	return;
 }
 
+#ifndef CONFIG_XEN
 /*
  * Do not allocate memory (or fail in any way) in machine_kexec().
  * We are past the point of no return, committed to rebooting now.
@@ -171,3 +276,4 @@ NORET_TYPE void machine_kexec(struct kim
 	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
 			image->start);
 }
+#endif
Index: linux-2.6.18.noarch/include/asm-i386/kexec.h
===================================================================
--- linux-2.6.18.noarch.orig/include/asm-i386/kexec.h
+++ linux-2.6.18.noarch/include/asm-i386/kexec.h
@@ -101,6 +101,20 @@ relocate_kernel(unsigned long indirectio
 		unsigned long start_address,
 		unsigned int has_pae) ATTRIB_NORET;
 
+
+/* Under Xen we need to work with machine addresses. These macros give the
+ * machine address of a certain page to the generic kexec code instead of
+ * the pseudo physical address which would be given by the default macros.
+ */
+
+#ifdef CONFIG_XEN
+#define KEXEC_ARCH_HAS_PAGE_MACROS
+#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
+#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _I386_KEXEC_H */
Index: linux-2.6.18.noarch/include/asm-x86_64/kexec.h
===================================================================
--- linux-2.6.18.noarch.orig/include/asm-x86_64/kexec.h
+++ linux-2.6.18.noarch/include/asm-x86_64/kexec.h
@@ -91,6 +91,19 @@ relocate_kernel(unsigned long indirectio
 		unsigned long page_list,
 		unsigned long start_address) ATTRIB_NORET;
 
+/* Under Xen we need to work with machine addresses. These macros give the
+ * machine address of a certain page to the generic kexec code instead of
+ * the pseudo physical address which would be given by the default macros.
+ */
+
+#ifdef CONFIG_XEN
+#define KEXEC_ARCH_HAS_PAGE_MACROS
+#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
+#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _X86_64_KEXEC_H */
Index: linux-2.6.18.noarch/include/linux/kexec.h
===================================================================
--- linux-2.6.18.noarch.orig/include/linux/kexec.h
+++ linux-2.6.18.noarch/include/linux/kexec.h
@@ -31,6 +31,13 @@
 #error KEXEC_ARCH not defined
 #endif
 
+#ifndef KEXEC_ARCH_HAS_PAGE_MACROS
+#define kexec_page_to_pfn(page)  page_to_pfn(page)
+#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
+#define kexec_virt_to_phys(addr) virt_to_phys(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(addr)
+#endif
+
 /*
  * This structure is used to hold the arguments that are used when loading
  * kernel binaries.
@@ -91,6 +98,12 @@ struct kimage {
 extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET;
 extern int machine_kexec_prepare(struct kimage *image);
 extern void machine_kexec_cleanup(struct kimage *image);
+#ifdef CONFIG_XEN
+extern int xen_machine_kexec_load(struct kimage *image);
+extern void xen_machine_kexec_unload(struct kimage *image);
+extern void xen_machine_kexec_setup_resources(void);
+extern void xen_machine_kexec_register_resources(struct resource *res);
+#endif
 extern asmlinkage long sys_kexec_load(unsigned long entry,
 					unsigned long nr_segments,
 					struct kexec_segment __user *segments,
Index: linux-2.6.18.noarch/kernel/kexec.c
===================================================================
--- linux-2.6.18.noarch.orig/kernel/kexec.c
+++ linux-2.6.18.noarch/kernel/kexec.c
@@ -330,13 +330,27 @@ static int kimage_is_destination_range(s
 	return 0;
 }
 
-static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order, unsigned long limit)
 {
 	struct page *pages;
 
 	pages = alloc_pages(gfp_mask, order);
 	if (pages) {
 		unsigned int count, i;
+#ifdef CONFIG_XEN
+		int address_bits;
+
+		if (limit == ~0UL)
+			address_bits = BITS_PER_LONG;
+		else
+			address_bits = long_log2(limit);
+
+		if (xen_create_contiguous_region((unsigned long)page_address(pages),
+						 order, address_bits) < 0) {
+			__free_pages(pages, order);
+			return NULL;
+		}
+#endif
 		pages->mapping = NULL;
 		set_page_private(pages, order);
 		count = 1 << order;
@@ -355,6 +369,9 @@ static void kimage_free_pages(struct pag
 	count = 1 << order;
 	for (i = 0; i < count; i++)
 		ClearPageReserved(page + i);
+#ifdef CONFIG_XEN
+	xen_destroy_contiguous_region((unsigned long)page_address(page), order);
+#endif
 	__free_pages(page, order);
 }
 
@@ -400,10 +417,10 @@ static struct page *kimage_alloc_normal_
 	do {
 		unsigned long pfn, epfn, addr, eaddr;
 
-		pages = kimage_alloc_pages(GFP_KERNEL, order);
+		pages = kimage_alloc_pages(GFP_KERNEL, order, KEXEC_CONTROL_MEMORY_LIMIT);
 		if (!pages)
 			break;
-		pfn   = page_to_pfn(pages);
+		pfn   = kexec_page_to_pfn(pages);
 		epfn  = pfn + count;
 		addr  = pfn << PAGE_SHIFT;
 		eaddr = epfn << PAGE_SHIFT;
@@ -437,6 +454,7 @@ static struct page *kimage_alloc_normal_
 	return pages;
 }
 
+#ifndef CONFIG_XEN
 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 						      unsigned int order)
 {
@@ -490,7 +508,7 @@ static struct page *kimage_alloc_crash_c
 		}
 		/* If I don't overlap any segments I have found my hole! */
 		if (i == image->nr_segments) {
-			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+			pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
 			break;
 		}
 	}
@@ -517,6 +535,13 @@ struct page *kimage_alloc_control_pages(
 
 	return pages;
 }
+#else /* CONFIG_XEN: control pages always come from the normal allocator */
+struct page *kimage_alloc_control_pages(struct kimage *image,
+					 unsigned int order)
+{
+	return kimage_alloc_normal_control_pages(image, order);
+}
+#endif
 
 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 {
@@ -532,7 +557,7 @@ static int kimage_add_entry(struct kimag
 			return -ENOMEM;
 
 		ind_page = page_address(page);
-		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+		*image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
 		image->entry = ind_page;
 		image->last_entry = ind_page +
 				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
@@ -593,13 +618,13 @@ static int kimage_terminate(struct kimag
 #define for_each_kimage_entry(image, ptr, entry) \
 	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
 		ptr = (entry & IND_INDIRECTION)? \
-			phys_to_virt((entry & PAGE_MASK)): ptr +1)
+			kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
 
 static void kimage_free_entry(kimage_entry_t entry)
 {
 	struct page *page;
 
-	page = pfn_to_page(entry >> PAGE_SHIFT);
+	page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
 	kimage_free_pages(page);
 }
 
@@ -611,6 +636,10 @@ static void kimage_free(struct kimage *i
 	if (!image)
 		return;
 
+#ifdef CONFIG_XEN
+	xen_machine_kexec_unload(image);
+#endif
+
 	kimage_free_extra_pages(image);
 	for_each_kimage_entry(image, ptr, entry) {
 		if (entry & IND_INDIRECTION) {
@@ -686,7 +715,7 @@ static struct page *kimage_alloc_page(st
 	 * have a match.
 	 */
 	list_for_each_entry(page, &image->dest_pages, lru) {
-		addr = page_to_pfn(page) << PAGE_SHIFT;
+		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
 		if (addr == destination) {
 			list_del(&page->lru);
 			return page;
@@ -697,16 +726,16 @@ static struct page *kimage_alloc_page(st
 		kimage_entry_t *old;
 
 		/* Allocate a page, if we run out of memory give up */
-		page = kimage_alloc_pages(gfp_mask, 0);
+		page = kimage_alloc_pages(gfp_mask, 0, KEXEC_SOURCE_MEMORY_LIMIT);
 		if (!page)
 			return NULL;
 		/* If the page cannot be used file it away */
-		if (page_to_pfn(page) >
+		if (kexec_page_to_pfn(page) >
 				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
 			list_add(&page->lru, &image->unuseable_pages);
 			continue;
 		}
-		addr = page_to_pfn(page) << PAGE_SHIFT;
+		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
 
 		/* If it is the destination page we want use it */
 		if (addr == destination)
@@ -729,7 +758,7 @@ static struct page *kimage_alloc_page(st
 			struct page *old_page;
 
 			old_addr = *old & PAGE_MASK;
-			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+			old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
 			copy_highpage(page, old_page);
 			*old = addr | (*old & ~PAGE_MASK);
 
@@ -779,7 +808,7 @@ static int kimage_load_normal_segment(st
 			result  = -ENOMEM;
 			goto out;
 		}
-		result = kimage_add_page(image, page_to_pfn(page)
+		result = kimage_add_page(image, kexec_page_to_pfn(page)
 								<< PAGE_SHIFT);
 		if (result < 0)
 			goto out;
@@ -811,6 +840,7 @@ out:
 	return result;
 }
 
+#ifndef CONFIG_XEN
 static int kimage_load_crash_segment(struct kimage *image,
 					struct kexec_segment *segment)
 {
@@ -833,7 +863,7 @@ static int kimage_load_crash_segment(str
 		char *ptr;
 		size_t uchunk, mchunk;
 
-		page = pfn_to_page(maddr >> PAGE_SHIFT);
+		page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
 		if (page == 0) {
 			result  = -ENOMEM;
 			goto out;
@@ -882,6 +912,13 @@ static int kimage_load_segment(struct ki
 
 	return result;
 }
+#else /* CONFIG_XEN: every segment is loaded as a normal segment */
+static int kimage_load_segment(struct kimage *image,
+				struct kexec_segment *segment)
+{
+	return kimage_load_normal_segment(image, segment);
+}
+#endif
 
 /*
  * Exec Kernel system call: for obvious reasons only root may call it.
@@ -992,6 +1029,13 @@ asmlinkage long sys_kexec_load(unsigned 
 		if (result)
 			goto out;
 	}
+#ifdef CONFIG_XEN
+	if (image) {
+		result = xen_machine_kexec_load(image);
+		if (result)
+			goto out;
+	}
+#endif
 	/* Install the new kernel, and  Uninstall the old */
 	image = xchg(dest_image, image);
 
@@ -1045,7 +1089,6 @@ void crash_kexec(struct pt_regs *regs)
 {
 	int locked;
 
-
 	/* Take the kexec_lock here to prevent sys_kexec_load
 	 * running on one cpu from replacing the crash kernel
 	 * we are using after a panic on a different cpu.


--- linux-2.6.18.noarch/include/linux/kexec.h.kabi	2007-06-21 14:49:59.000000000 +0200
+++ linux-2.6.18.noarch/include/linux/kexec.h	2007-06-21 15:08:22.000000000 +0200
@@ -5,7 +5,9 @@
 #include <linux/types.h>
 #include <linux/list.h>
 #include <linux/linkage.h>
+#if !defined(__GENKSYMS__) || !defined(CONFIG_XEN)
 #include <linux/compat.h>
+#endif
 #include <linux/ioport.h>
 #include <asm/kexec.h>