kernel-2.6.18-128.1.10.el5.src.rpm

From: Don Dutile <ddutile@redhat.com>
Date: Thu, 31 Jan 2008 15:44:59 -0500
Subject: [xen] 32-bit pv guest migration can fail under load
Message-id: 47A2334B.9000000@redhat.com
O-Subject: [RHEL5.2 PATCH] [Bug 425471] Save/restore/migration of 32-bit pv guests can fail under load
Bugzilla: 425471

BZ 425471

Problem Description
===================
When migrating a 32-bit (i686) RHEL5 domU/guest that is under interrupt load,
say from a parallel kernel make, the restore may fail due to a stack overflow
during secondary cpu setup.

Xen starts up the primary cpu in a guest, then effectively hotplugs additional (v)cpus.
Thus, unlike initial system boot, where the processors are all initialized
before user programs run, a Xen guest being saved/restored or migrated can
have an active user load during secondary cpu initialization.

In cpu_initialize_context() (in drivers/xen/core/smpboot.c),
a 2800-byte context structure was defined on the stack.
i386/i686 stacks are only 4K.  It's not nice to use 2800 bytes
out of 4096, especially when that function is 5-6 functions deep in the
call stack.  Add an interrupt on that stack (until do_IRQ switches
to the interrupt stack), and a stack overflow occurs.
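
For illustration, the problematic pattern in the old code, with the stack
budget spelled out (sizes from above; the ~1300 bytes left over must hold
this frame's other locals, the 5-6 frames below it, and any interrupt frame
pushed before do_IRQ switches stacks):

	static void cpu_initialize_context(unsigned int cpu)
	{
		vcpu_guest_context_t ctxt;	/* ~2800 of 4096 stack bytes */
		...
	}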

This problem has existed from the beginning; it was well hidden, since one
needs to load the guest and have enough memory (>= 1G) and enough
vcpus (4 in this case) to generate enough (virtual) interrupts to have
all the stars line up  ... in this case, about 100 save/restore iterations.

Fix
===
kmalloc() and kfree() the context structure instead of declaring it on the
stack, and fail __cpu_up() with -ENOMEM if the allocation fails.
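
In outline (a condensed sketch of the diff below, which also propagates the
failure up through __cpu_up()):

	static int cpu_initialize_context(unsigned int cpu)
	{
		vcpu_guest_context_t *ctxt;

		ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);	/* heap, not stack */
		if (ctxt == NULL)
			return -ENOMEM;
		memset(ctxt, 0, sizeof(*ctxt));

		/* ... fill in ctxt->user_regs, callbacks, etc. as before ... */

		BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt));
		kfree(ctxt);
		return 0;
	}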

Upstream status
===============
A very similar fix is in LKML, for Xen pv-ops.
It was modified to use a kzalloc'd & freed context structure.
I used kmalloc() & memset() in case this function gets backported to
older kernels without kzalloc(), making it more portable (see the comparison below).
Oddly enough, the Xen pv-ops version was done by Jeremy Fitzhardinge of XenSource,
but the fix/modification was never pushed to XS's linux-2.6.18-xen.hg.
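
For reference, the two allocation styles are equivalent; kzalloc() just isn't
available on older kernels (it only showed up around 2.6.14):

	/* pv-ops style, needs a kernel with kzalloc(): */
	ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);

	/* the more backportable equivalent used here: */
	ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
	if (ctxt)
		memset(ctxt, 0, sizeof(*ctxt));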

A patch compatible with xen-unstable was posted upstream for inclusion by XenSource.

Testing
=======
A save/restore loop running on a rhel5.1 dom0, against a rhel5.2-75-modified domU guest,
with the guest running a make -j5 of a rhel5 tree in a loop, would typically
stall in the restore with a stack overflow that eventually tied itself into
a knot of failing page-fault handling, after about 100 iterations (a couple of
hours) of the save/restore loop.
After the patch was applied, the save/restore loop ran for over 650 iterations (overnight),
after which I stopped it due to other testing (port to rhel4-xenU, reconfigure
test setup, post patch to xen-devel, etc.).
Note: the pre-76 rhel5 kernels had a problem with user procs being tied to cpu0.
       To get around that limitation, and to ensure make -j5 was executing across
       all 4 vcpus, a 'taskset -c 0-3 bash' was executed, and 'top' was used to prove
       that the compiles were running on all 4 vcpus (to generate enough interrupt load).

Debug kudos to Dave Anderson & his crash utility.  These stack traces are a ton of fun to
pick apart (according to Dave, not me...).

Please review (attached patch) & ACK.

- Don

Acked-by: Bill Burns <bburns@redhat.com>
Acked-by: Chris Lalancette <clalance@redhat.com>
Acked-by: Dave Anderson <anderson@redhat.com>

diff --git a/drivers/xen/core/smpboot.c b/drivers/xen/core/smpboot.c
index 2a4513c..99cdbf1 100644
--- a/drivers/xen/core/smpboot.c
+++ b/drivers/xen/core/smpboot.c
@@ -160,9 +160,9 @@ static void cpu_bringup_and_idle(void)
 	cpu_idle();
 }
 
-static void cpu_initialize_context(unsigned int cpu)
+static int cpu_initialize_context(unsigned int cpu)
 {
-	vcpu_guest_context_t ctxt;
+	vcpu_guest_context_t *ctxt;
 	struct task_struct *idle = idle_task(cpu);
 #ifdef __x86_64__
 	struct desc_ptr *gdt_descr = &cpu_gdt_descr[cpu];
@@ -171,58 +171,65 @@ static void cpu_initialize_context(unsigned int cpu)
 #endif
 
 	if (cpu_test_and_set(cpu, cpu_initialized_map))
-		return;
+		return 0;
 
-	memset(&ctxt, 0, sizeof(ctxt));
+	ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
+	if (ctxt == NULL)
+		return -ENOMEM;
 
-	ctxt.flags = VGCF_IN_KERNEL;
-	ctxt.user_regs.ds = __USER_DS;
-	ctxt.user_regs.es = __USER_DS;
-	ctxt.user_regs.fs = 0;
-	ctxt.user_regs.gs = 0;
-	ctxt.user_regs.ss = __KERNEL_DS;
-	ctxt.user_regs.eip = (unsigned long)cpu_bringup_and_idle;
-	ctxt.user_regs.eflags = X86_EFLAGS_IF | 0x1000; /* IOPL_RING1 */
+	memset(ctxt, 0, sizeof(*ctxt));
 
-	memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
+	ctxt->flags = VGCF_IN_KERNEL;
+	ctxt->user_regs.ds = __USER_DS;
+	ctxt->user_regs.es = __USER_DS;
+	ctxt->user_regs.fs = 0;
+	ctxt->user_regs.gs = 0;
+	ctxt->user_regs.ss = __KERNEL_DS;
+	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
+	ctxt->user_regs.eflags = X86_EFLAGS_IF | 0x1000; /* IOPL_RING1 */
 
-	smp_trap_init(ctxt.trap_ctxt);
+	memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
 
-	ctxt.ldt_ents = 0;
+	smp_trap_init(ctxt->trap_ctxt);
 
-	ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address);
-	ctxt.gdt_ents      = gdt_descr->size / 8;
+	ctxt->ldt_ents = 0;
+
+	ctxt->gdt_frames[0] = virt_to_mfn(gdt_descr->address);
+	ctxt->gdt_ents      = gdt_descr->size / 8;
 
 #ifdef __i386__
-	ctxt.user_regs.cs = __KERNEL_CS;
-	ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
+	ctxt->user_regs.cs = __KERNEL_CS;
+	ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
 
-	ctxt.kernel_ss = __KERNEL_DS;
-	ctxt.kernel_sp = idle->thread.esp0;
+	ctxt->kernel_ss = __KERNEL_DS;
+	ctxt->kernel_sp = idle->thread.esp0;
 
-	ctxt.event_callback_cs     = __KERNEL_CS;
-	ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
-	ctxt.failsafe_callback_cs  = __KERNEL_CS;
-	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
+	ctxt->event_callback_cs     = __KERNEL_CS;
+	ctxt->event_callback_eip    = (unsigned long)hypervisor_callback;
+	ctxt->failsafe_callback_cs  = __KERNEL_CS;
+	ctxt->failsafe_callback_eip = (unsigned long)failsafe_callback;
 
-	ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
+	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
 #else /* __x86_64__ */
-	ctxt.user_regs.cs = __KERNEL_CS;
-	ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
+	ctxt->user_regs.cs = __KERNEL_CS;
+	ctxt->user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
 
-	ctxt.kernel_ss = __KERNEL_DS;
-	ctxt.kernel_sp = idle->thread.rsp0;
+	ctxt->kernel_ss = __KERNEL_DS;
+	ctxt->kernel_sp = idle->thread.rsp0;
 
-	ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
-	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
-	ctxt.syscall_callback_eip  = (unsigned long)system_call;
+	ctxt->event_callback_eip    = (unsigned long)hypervisor_callback;
+	ctxt->failsafe_callback_eip = (unsigned long)failsafe_callback;
+	ctxt->syscall_callback_eip  = (unsigned long)system_call;
 
-	ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
+	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
 
-	ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
+	ctxt->gs_base_kernel = (unsigned long)(cpu_pda(cpu));
 #endif
 
-	BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
+	BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt));
+
+	kfree(ctxt);
+	return 0;
 }
 
 void __init smp_prepare_cpus(unsigned int max_cpus)
@@ -401,7 +408,9 @@ int __cpuinit __cpu_up(unsigned int cpu)
 	rc = cpu_up_check(cpu);
 	if (rc)
 		return rc;
-	cpu_initialize_context(cpu);
+	rc = cpu_initialize_context(cpu);
+	if (rc)
+		return rc;
 
 	if (num_online_cpus() == 1)
 		alternatives_smp_switch(1);