Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > fc11cd6e1c513a17304da94a5390f3cd > files > 4242

kernel-2.6.18-194.11.1.el5.src.rpm

From: Bill Burns <bburns@redhat.com>
Date: Tue, 9 Sep 2008 22:15:30 -0400
Subject: [xen] fix crash on IRQ exhaustion and increase NR_IRQS
Message-id: 48C72DC2.70300@redhat.com
O-Subject: Re: [RHEL5.3 PATCH 0/1] Xen Fix crash on IRQ exhaustion and increase NR_IRQS
Bugzilla: 442736
RH-Acked-by: Chris Lalancette <clalance@redhat.com>
RH-Acked-by: Markus Armbruster <armbru@redhat.com>
RH-Acked-by: Don Dutile <ddutile@redhat.com>
RH-Acked-by: Rik van Riel <riel@redhat.com>

Fixes bz 442736

Problem description:

Running out of IRQs when creating Xen guests causes a system
crash. Upstream patches exist that make guest creation
fail instead, avoiding the crash. Also, the number of IRQs provided
restricts the number of guests that can be created on
large systems and causes fewer guests to be available when
more physical CPUs are present. The existing limit tops out at
around 70 guests on a 64 CPU system. (Note that each physical
CPU consumes 3 IRQs and each guest uses one for each disk
and network device.)

This patch adds 768 IRQs, which takes into account the
additional 192 IRQs needed to move up from 64
CPUs to 128 CPUs. This leaves 576 additional IRQs for
guests. The change to the number of IRQs is
x86_64 and Xen specific, while the crash avoidance is
architecture neutral.

kabi issue:

Increasing the number of dynamic IRQs affects the
NR_IRQS define, which is used in the kernel_stat
structure and that structure ends up being part of
the kabi as cpu__kstat. Several avenues to address
this were considered to maintain NR_IRQS as the
same value. The issue with IRQs above the range
of NR_IRQS is problematic since there is a wide
distribution of code in the kernel proper that
uses NR_IRQS as a bounds check and irq numbers
as indexes into arrays bounded by NR_IRQS. Having
a false NR_IRQS for a lower amount runs the very
real risk of causing corruption when an irq number
is used to index a smaller array.

Making NR_IRQS accurate is far superior as it
avoids changing a large amount of code that
is currently working and it avoids the corruption
risk.

Since the one kabi issue is with the kernel_stat
structure and the difference in that structure
is an addition to the end (increasing the array
that is the last element in the structure) the
proposed solution is to use __GENKSYMS__ to
control the dynamic IRQs like this:

 #define DYNIRQ_BASE            (PIRQ_BASE + NR_PIRQS)
+#ifdef __GENKSYMS__
 #define NR_DYNIRQS             256
+#else /* __GENKSYMS__ */
+#define NR_DYNIRQS             1024
+#endif /* __GENKSYMS__ */

Thus the kernel is built consistently with the new
value, the ksyms build uses the old one, and the only
exposed structure is:

struct kernel_stat {
        struct cpu_usage_stat   cpustat;
        unsigned int irqs[NR_IRQS];
};

Someone using the old value will not read the entire
kernel structure, but the smaller amount, which is at
least safe.

Upstream status:
In Xen unstable, the fix for crashing when IRQS are exhausted is here:
http://xenbits.xensource.com/xen-unstable.hg?rev/ff5f976191a5

changeset 12790:  	ff5f976191a5
parent 12789:	ec9259920f85
child 12792:	cf11417d7eb6
author: 	kfraser@localhost.localdomain
date: 	Thu Dec 07 16:43:08 2006 +0000 (17 months ago)
files: 	linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c linux-2.6-xen-sparse/drivers/xen/core/evtchn.c linux-2.6-xen-sparse/drivers/xen/core/smpboot.c linux-2.6-xen-sparse/drivers/xen/netback/netback.c
description: 	[LINUX] Fail gracefully if we run out of spare IRQs.
Signed-off-by: Keir Fraser <keir@xensource.com>

diff --git a/arch/i386/kernel/time-xen.c b/arch/i386/kernel/time-xen.c
index 2c51117..6f23a83 100644
--- a/arch/i386/kernel/time-xen.c
+++ b/arch/i386/kernel/time-xen.c
@@ -1057,9 +1057,9 @@ void time_resume(void)
 #ifdef CONFIG_SMP
 static char timer_name[NR_CPUS][15];
 
-void local_setup_timer(unsigned int cpu)
+int local_setup_timer(unsigned int cpu)
 {
-	int seq;
+	int seq, irq;
 
 	BUG_ON(cpu == 0);
 
@@ -1072,15 +1072,17 @@ void local_setup_timer(unsigned int cpu)
 	} while (read_seqretry(&xtime_lock, seq));
 
 	sprintf(timer_name[cpu], "timer%d", cpu);
-	per_cpu(timer_irq, cpu) =
-		bind_virq_to_irqhandler(
-			VIRQ_TIMER,
-			cpu,
-			timer_interrupt,
-			SA_INTERRUPT,
-			timer_name[cpu],
-			NULL);
-	BUG_ON(per_cpu(timer_irq, cpu) < 0);
+	irq = bind_virq_to_irqhandler(VIRQ_TIMER,
+				      cpu,
+				      timer_interrupt,
+				      SA_INTERRUPT,
+				      timer_name[cpu],
+				      NULL);
+	if (irq < 0)
+		return irq;
+	per_cpu(timer_irq, cpu) = irq;
+
+	return 0;
 }
 
 void local_teardown_timer(unsigned int cpu)
diff --git a/drivers/xen/blkback/interface.c b/drivers/xen/blkback/interface.c
index b4bb7e6..3e7c50f 100644
--- a/drivers/xen/blkback/interface.c
+++ b/drivers/xen/blkback/interface.c
@@ -153,6 +153,15 @@ int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
 	blkif->irq = bind_evtchn_to_irqhandler(
 		blkif->evtchn, blkif_be_int, 0, "blkif-backend", blkif);
 
+        if (blkif->irq < 0) {
+                err = blkif->irq;
+                blkif->irq = 0;
+ 		unmap_frontend_page(blkif);
+		free_vm_area(blkif->blk_ring_area);
+		blkif->blk_rings.common.sring = NULL;
+		return err;
+       }
+
 	return 0;
 }
 
diff --git a/drivers/xen/blktap/interface.c b/drivers/xen/blktap/interface.c
index c7731a3..ac6ed6c 100644
--- a/drivers/xen/blktap/interface.c
+++ b/drivers/xen/blktap/interface.c
@@ -154,6 +154,15 @@ int tap_blkif_map(blkif_t *blkif, unsigned long shared_page,
 	blkif->irq = bind_evtchn_to_irqhandler(
 		blkif->evtchn, tap_blkif_be_int, 0, "blkif-backend", blkif);
 
+        if (blkif->irq < 0) {
+                err = blkif->irq;
+                blkif->irq = 0;
+ 		unmap_frontend_page(blkif);
+		free_vm_area(blkif->blk_ring_area);
+		blkif->blk_rings.common.sring = NULL;
+		return err;
+       }
+
 	return 0;
 }
 
diff --git a/drivers/xen/core/evtchn.c b/drivers/xen/core/evtchn.c
index 9a860b2..5217649 100644
--- a/drivers/xen/core/evtchn.c
+++ b/drivers/xen/core/evtchn.c
@@ -300,8 +300,15 @@ static int find_unbound_irq(void)
 		if (irq_bindcount[irq] == 0)
 			break;
 
-	if (irq == NR_IRQS)
-		panic("No available IRQ to bind to: increase NR_IRQS!\n");
+	if (irq == NR_IRQS) {
+		static int warned;
+		if (!warned) {
+			warned = 1;
+			printk(KERN_WARNING "No available IRQ to bind to: "
+			       "increase NR_IRQS!\n");
+		}
+		return -ENOSPC;
+	}
 
 	return irq;
 }
@@ -313,15 +320,17 @@ static int bind_evtchn_to_irq(unsigned int evtchn)
 	spin_lock(&irq_mapping_update_lock);
 
 	if ((irq = evtchn_to_irq[evtchn]) == -1) {
-		irq = find_unbound_irq();
+		if ((irq = find_unbound_irq()) < 0)
+			goto out;
+
 		evtchn_to_irq[evtchn] = irq;
 		irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
 	}
 
 	irq_bindcount[irq]++;
 
+ out:
 	spin_unlock(&irq_mapping_update_lock);
-
 	return irq;
 }
 
@@ -333,6 +342,9 @@ static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
 	spin_lock(&irq_mapping_update_lock);
 
 	if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) {
+		if ((irq = find_unbound_irq()) < 0)
+			goto out;
+
 		bind_virq.virq = virq;
 		bind_virq.vcpu = cpu;
 		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
@@ -340,7 +352,6 @@ static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
 			BUG();
 		evtchn = bind_virq.port;
 
-		irq = find_unbound_irq();
 		evtchn_to_irq[evtchn] = irq;
 		irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
 
@@ -351,8 +362,8 @@ static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
 
 	irq_bindcount[irq]++;
 
+ out:
 	spin_unlock(&irq_mapping_update_lock);
-
 	return irq;
 }
 
@@ -364,13 +375,15 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
 	spin_lock(&irq_mapping_update_lock);
 
 	if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) {
+		if ((irq = find_unbound_irq()) < 0)
+			goto out;
+
 		bind_ipi.vcpu = cpu;
 		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
 						&bind_ipi) != 0)
 			BUG();
 		evtchn = bind_ipi.port;
 
-		irq = find_unbound_irq();
 		evtchn_to_irq[evtchn] = irq;
 		irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
 
@@ -381,8 +394,8 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
 
 	irq_bindcount[irq]++;
 
+ out:
 	spin_unlock(&irq_mapping_update_lock);
-
 	return irq;
 }
 
@@ -432,6 +445,9 @@ int bind_evtchn_to_irqhandler(
 	int retval;
 
 	irq = bind_evtchn_to_irq(evtchn);
+	if (irq < 0)
+		return irq;
+
 	retval = request_irq(irq, handler, irqflags, devname, dev_id);
 	if (retval != 0) {
 		unbind_from_irq(irq);
@@ -454,6 +470,9 @@ int bind_virq_to_irqhandler(
 	int retval;
 
 	irq = bind_virq_to_irq(virq, cpu);
+	if (irq < 0)
+		return irq;
+
 	retval = request_irq(irq, handler, irqflags, devname, dev_id);
 	if (retval != 0) {
 		unbind_from_irq(irq);
@@ -476,6 +495,9 @@ int bind_ipi_to_irqhandler(
 	int retval;
 
 	irq = bind_ipi_to_irq(ipi, cpu);
+	if (irq < 0)
+		return irq;
+
 	retval = request_irq(irq, handler, irqflags, devname, dev_id);
 	if (retval != 0) {
 		unbind_from_irq(irq);
diff --git a/drivers/xen/core/smpboot.c b/drivers/xen/core/smpboot.c
index 99cdbf1..5f43010 100644
--- a/drivers/xen/core/smpboot.c
+++ b/drivers/xen/core/smpboot.c
@@ -28,7 +28,7 @@
 extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *);
 extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *);
 
-extern void local_setup_timer(unsigned int cpu);
+extern int local_setup_timer(unsigned int cpu);
 extern void local_teardown_timer(unsigned int cpu);
 
 extern void hypervisor_callback(void);
@@ -107,32 +107,45 @@ set_cpu_sibling_map(int cpu)
 	cpu_data[cpu].booted_cores = 1;
 }
 
-static void xen_smp_intr_init(unsigned int cpu)
+static int xen_smp_intr_init(unsigned int cpu)
 {
+	int rc;
+
+	per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
+
 	sprintf(resched_name[cpu], "resched%d", cpu);
-	per_cpu(resched_irq, cpu) =
-		bind_ipi_to_irqhandler(
-			RESCHEDULE_VECTOR,
-			cpu,
-			smp_reschedule_interrupt,
-			SA_INTERRUPT,
-			resched_name[cpu],
-			NULL);
-	BUG_ON(per_cpu(resched_irq, cpu) < 0);
+	rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
+				    cpu,
+				    smp_reschedule_interrupt,
+				    SA_INTERRUPT,
+				    resched_name[cpu],
+				    NULL);
+	if (rc < 0)
+		goto fail;
+	per_cpu(resched_irq, cpu) = rc;
 
 	sprintf(callfunc_name[cpu], "callfunc%d", cpu);
-	per_cpu(callfunc_irq, cpu) =
-		bind_ipi_to_irqhandler(
-			CALL_FUNCTION_VECTOR,
-			cpu,
-			smp_call_function_interrupt,
-			SA_INTERRUPT,
-			callfunc_name[cpu],
-			NULL);
-	BUG_ON(per_cpu(callfunc_irq, cpu) < 0);
+	rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
+				    cpu,
+				    smp_call_function_interrupt,
+				    SA_INTERRUPT,
+				    callfunc_name[cpu],
+				    NULL);
+	if (rc < 0)
+		goto fail;
+	per_cpu(callfunc_irq, cpu) = rc;
+
+	if ((cpu != 0) && ((rc = local_setup_timer(cpu)) != 0))
+		goto fail;
 
-	if (cpu != 0)
-		local_setup_timer(cpu);
+	return 0;
+
+ fail:
+	if (per_cpu(resched_irq, cpu) >= 0)
+		unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
+	if (per_cpu(callfunc_irq, cpu) >= 0)
+		unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+	return rc;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -257,7 +270,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 
 	set_cpu_sibling_map(0);
 
-	xen_smp_intr_init(0);
+	if (xen_smp_intr_init(0))
+		BUG();
 
 	cpu_initialized_map = cpumask_of_cpu(0);
 
@@ -419,7 +433,13 @@ int __cpuinit __cpu_up(unsigned int cpu)
 	set_cpu_sibling_map(cpu);
 	wmb();
 
-	xen_smp_intr_init(cpu);
+
+	rc = xen_smp_intr_init(cpu);
+	if (rc) {
+		remove_siblinginfo(cpu);
+		return rc;
+	}
+
 	cpu_set(cpu, cpu_online_map);
 
 	rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
diff --git a/drivers/xen/netback/interface.c b/drivers/xen/netback/interface.c
index 04f5796..4b85c54 100644
--- a/drivers/xen/netback/interface.c
+++ b/drivers/xen/netback/interface.c
@@ -289,6 +289,11 @@ int netif_map(netif_t *netif, unsigned long tx_ring_ref,
 
 	netif->irq = bind_evtchn_to_irqhandler(
 		netif->evtchn, netif_be_int, 0, netif->dev->name, netif);
+        if (netif->irq < 0) {
+                netif->irq = 0;
+                goto err_hypervisor;
+        }
+
 	disable_irq(netif->irq);
 
 	txs = (netif_tx_sring_t *)netif->tx_comms_area->addr;
diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c
index bf2c6e7..1468fb8 100644
--- a/drivers/xen/netback/netback.c
+++ b/drivers/xen/netback/netback.c
@@ -1508,13 +1508,12 @@ static int __init netback_init(void)
 	netif_xenbus_init();
 
 #ifdef NETBE_DEBUG_INTERRUPT
-	(void)bind_virq_to_irqhandler(
-		VIRQ_DEBUG,
-		0,
-		netif_be_dbg,
-		SA_SHIRQ, 
-		"net-be-dbg",
-		&netif_be_dbg);
+	(void)bind_virq_to_irqhandler(VIRQ_DEBUG,
+				      0,
+				      netif_be_dbg,
+				      SA_SHIRQ, 
+				      "net-be-dbg",
+				      &netif_be_dbg);
 #endif
 
 	return 0;
diff --git a/include/asm-x86_64/mach-xen/irq_vectors.h b/include/asm-x86_64/mach-xen/irq_vectors.h
index c7d85b1..4b4761c 100644
--- a/include/asm-x86_64/mach-xen/irq_vectors.h
+++ b/include/asm-x86_64/mach-xen/irq_vectors.h
@@ -109,7 +109,11 @@
 #define NR_PIRQS		256
 
 #define DYNIRQ_BASE		(PIRQ_BASE + NR_PIRQS)
+#ifdef __GENKSYMS__
 #define NR_DYNIRQS		256
+#else /* __GENKSYMS__ */
+#define NR_DYNIRQS		1024
+#endif /* __GENKSYMS__ */
 
 #define NR_IRQS			(NR_PIRQS + NR_DYNIRQS)
 #define NR_IRQ_VECTORS		NR_IRQS