Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > media > main-src > by-pkgid > d0a35cd31c1125e2132804d68547073d > files > 4502

kernel-2.6.18-194.26.1.el5.src.rpm

From: Chris Lalancette <clalance@redhat.com>
Date: Wed, 8 Apr 2009 10:47:28 +0200
Subject: [xen] fix evtchn exhaustion with 32-bit HVM guest
Message-id: 49DC64A0.7040109@redhat.com
O-Subject: [RHEL5.4 PATCH]: Fix evtchn exhaustion with 32-bit HVM guest
Bugzilla: 489274
RH-Acked-by: Gerd Hoffmann <kraxel@redhat.com>
RH-Acked-by: Rik van Riel <riel@redhat.com>
RH-Acked-by: Don Dutile <ddutile@redhat.com>

All,
     When running the PV-on-HVM drivers in a 32-bit fully virtualized guest, you
currently cannot attach more than 16 disks to the guest.  This is because the
hypervisor wasn't properly keeping track of the "compat" status of the guest.
In turn, this led the hypervisor to exhaust event channels much more quickly
than it should have.
     The solution is to track the COMPAT status of the domain, and allocate
event channels accordingly.  With this patch in place, I was able to attach 50
disks to a 32-bit PV-on-HVM guest.  This patch is a backport of upstream
xen-unstable c/s 18266 and 19266.
     This should resolve BZ 489274.  Please review and ACK.

--
Chris Lalancette

diff --git a/arch/x86/domain.c b/arch/x86/domain.c
index 562a9e2..bee2dfb 100644
--- a/arch/x86/domain.c
+++ b/arch/x86/domain.c
@@ -876,7 +876,7 @@ map_vcpu_info(struct vcpu *v, unsigned long mfn, unsigned offset)
      * lost.  The domain will get a spurious event, but it can cope.
      */
     vcpu_info(v, evtchn_upcall_pending) = 1;
-    for ( i = 0; i < BITS_PER_GUEST_LONG(d); i++ )
+    for ( i = 0; i < BITS_PER_EVTCHN_WORD(d); i++ )
         set_bit(i, vcpu_info_addr(v, evtchn_pending_sel));
 
     /*
diff --git a/arch/x86/domain_build.c b/arch/x86/domain_build.c
index 59141cc..40e6e93 100644
--- a/arch/x86/domain_build.c
+++ b/arch/x86/domain_build.c
@@ -781,12 +781,8 @@ int __init construct_dom0(
 
     if ( opt_dom0_max_vcpus == 0 )
         opt_dom0_max_vcpus = num_online_cpus();
-    if ( opt_dom0_max_vcpus > num_online_cpus() )
-        opt_dom0_max_vcpus = num_online_cpus();
     if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS )
         opt_dom0_max_vcpus = MAX_VIRT_CPUS;
-    if ( opt_dom0_max_vcpus > BITS_PER_GUEST_LONG(d) )
-        opt_dom0_max_vcpus = BITS_PER_GUEST_LONG(d);
     printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus);
 
     /*
@@ -800,7 +796,7 @@ int __init construct_dom0(
     }
 
     for ( i = 1; i < opt_dom0_max_vcpus; i++ )
-        (void)alloc_vcpu(d, i, i);
+        (void)alloc_vcpu(d, i, i % num_online_cpus());
 
     /* Set up CR3 value for write_ptbase */
     if ( paging_mode_enabled(v->domain) )
diff --git a/arch/x86/domctl.c b/arch/x86/domctl.c
index fd04c0b..412b26e 100644
--- a/arch/x86/domctl.c
+++ b/arch/x86/domctl.c
@@ -423,7 +423,8 @@ long arch_do_domctl(
         if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL )
             break;
 
-        domctl->u.address_size.size = BITS_PER_GUEST_LONG(d);
+        domctl->u.address_size.size =
+            is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
 
         ret = 0;
         rcu_unlock_domain(d);
diff --git a/arch/x86/irq.c b/arch/x86/irq.c
index afdcf86..d1bc8a0 100644
--- a/arch/x86/irq.c
+++ b/arch/x86/irq.c
@@ -964,7 +964,8 @@ static void dump_irqs(unsigned char key)
                        (test_bit(d->pirq_to_evtchn[irq],
                                  shared_info_addr(d, evtchn_pending)) ?
                         'P' : '-'),
-                       (test_bit(d->pirq_to_evtchn[irq]/BITS_PER_GUEST_LONG(d),
+                       (test_bit(d->pirq_to_evtchn[irq] /
+				 BITS_PER_EVTCHN_WORD(d),
                                  vcpu_info_addr(d->vcpu[0], evtchn_pending_sel)) ?
                         'S' : '-'),
                        (test_bit(d->pirq_to_evtchn[irq],
diff --git a/arch/x86/mm.c b/arch/x86/mm.c
index fe71d1f..1d49d4a 100644
--- a/arch/x86/mm.c
+++ b/arch/x86/mm.c
@@ -1994,6 +1994,12 @@ static inline cpumask_t vcpumask_to_pcpumask(
     cpumask_t    pmask = CPU_MASK_NONE;
     struct vcpu *v;
 
+    /*
+     * Callers copy only a single guest-sized longword from the guest.
+     * This must be wide enough to reference all VCPUs. Worst case is 32 bits.
+     */
+    BUILD_BUG_ON(MAX_VIRT_CPUS > 32);
+
     while ( vmask != 0 )
     {
         vcpu_id = find_first_set_bit(vmask);
diff --git a/common/domctl.c b/common/domctl.c
index 2d3270f..5a61bfc 100644
--- a/common/domctl.c
+++ b/common/domctl.c
@@ -228,13 +228,15 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
         if ( (c.nat = xmalloc(struct vcpu_guest_context)) == NULL )
             goto svc_out;
 
-        if ( !IS_COMPAT(v->domain) )
-            ret = copy_from_guest(c.nat, op->u.vcpucontext.ctxt, 1);
 #ifdef CONFIG_COMPAT
+        if ( !is_pv_32on64_vcpu(v) )
+            ret = copy_from_guest(c.nat, op->u.vcpucontext.ctxt, 1);
         else
             ret = copy_from_guest(c.cmp,
                                   guest_handle_cast(op->u.vcpucontext.ctxt,
                                                     void), 1);
+#else
+        ret = copy_from_guest(c.nat, op->u.vcpucontext.ctxt, 1);
 #endif
         ret = ret ? -EFAULT : 0;
 
@@ -530,12 +532,14 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
         if ( v != current )
             vcpu_unpause(v);
 
-        if ( !IS_COMPAT(v->domain) )
-            ret = copy_to_guest(op->u.vcpucontext.ctxt, c.nat, 1);
 #ifdef CONFIG_COMPAT
+        if ( !is_pv_32on64_vcpu(v) )
+            ret = copy_to_guest(op->u.vcpucontext.ctxt, c.nat, 1);
         else
             ret = copy_to_guest(guest_handle_cast(op->u.vcpucontext.ctxt,
                                                   void), c.cmp, 1);
+#else
+        ret = copy_to_guest(op->u.vcpucontext.ctxt, c.nat, 1);
 #endif
 
         if ( copy_to_guest(u_domctl, op, 1) || ret )
diff --git a/common/event_channel.c b/common/event_channel.c
index 4407001..142aff0 100644
--- a/common/event_channel.c
+++ b/common/event_channel.c
@@ -517,7 +517,7 @@ void evtchn_set_pending(struct vcpu *v, int port)
         return;
 
     if ( !test_bit        (port, __shared_info_addr(d, s, evtchn_mask)) &&
-         !test_and_set_bit(port / BITS_PER_GUEST_LONG(d),
+         !test_and_set_bit(port / BITS_PER_EVTCHN_WORD(d),
                            vcpu_info_addr(v, evtchn_pending_sel)) )
     {
         vcpu_mark_events_pending(v);
@@ -726,7 +726,7 @@ static long evtchn_unmask(evtchn_unmask_t *unmask)
      */
     if ( test_and_clear_bit(port, __shared_info_addr(d, s, evtchn_mask)) &&
          test_bit          (port, __shared_info_addr(d, s, evtchn_pending)) &&
-         !test_and_set_bit (port / BITS_PER_GUEST_LONG(d),
+         !test_and_set_bit (port / BITS_PER_EVTCHN_WORD(d),
                             vcpu_info_addr(v, evtchn_pending_sel)) )
     {
         vcpu_mark_events_pending(v);
diff --git a/common/keyhandler.c b/common/keyhandler.c
index e6e4d29..0861f85 100644
--- a/common/keyhandler.c
+++ b/common/keyhandler.c
@@ -205,7 +205,7 @@ static void dump_domains(unsigned char key)
                    test_bit(v->virq_to_evtchn[VIRQ_DEBUG], 
                             shared_info_addr(d, evtchn_mask)),
                    test_bit(v->virq_to_evtchn[VIRQ_DEBUG] /
-                            BITS_PER_GUEST_LONG(d),
+                            BITS_PER_EVTCHN_WORD(d),
                             vcpu_info_addr(v, evtchn_pending_sel)));
             send_guest_vcpu_virq(v, VIRQ_DEBUG);
         }
diff --git a/common/trace.c b/common/trace.c
index 5981f49..87a12fc 100644
--- a/common/trace.c
+++ b/common/trace.c
@@ -37,7 +37,7 @@
 #define xen_t_buf t_buf
 CHECK_t_buf;
 #undef xen_t_buf
-#define TB_COMPAT IS_COMPAT(dom0)
+#define TB_COMPAT is_pv_32on64_domain(dom0)
 #else
 #define compat_t_rec t_rec
 #define TB_COMPAT 0
diff --git a/common/xenoprof.c b/common/xenoprof.c
index e25ad7b..86e30f0 100644
--- a/common/xenoprof.c
+++ b/common/xenoprof.c
@@ -171,7 +171,7 @@ static int alloc_xenoprof_struct(
     bufsize = sizeof(struct xenoprof_buf);
     i = sizeof(struct event_log);
 #ifdef CONFIG_COMPAT
-    d->xenoprof->is_compat = IS_COMPAT(is_passive ? dom0 : d);
+    d->xenoprof->is_compat = is_pv_32on64_domain(is_passive ? dom0 : d);
     if ( XENOPROF_COMPAT(d->xenoprof) )
     {
         bufsize = sizeof(struct compat_oprof_buf);
diff --git a/include/asm-x86/domain.h b/include/asm-x86/domain.h
index d13d1b6..c3784d1 100644
--- a/include/asm-x86/domain.h
+++ b/include/asm-x86/domain.h
@@ -16,7 +16,6 @@
 #define is_pv_32on64_domain(d) (0)
 #endif
 #define is_pv_32on64_vcpu(v)   (is_pv_32on64_domain((v)->domain))
-#define IS_COMPAT(d)           (is_pv_32on64_domain(d))
 
 struct trap_bounce {
     uint32_t      error_code;
diff --git a/include/xen/compat.h b/include/xen/compat.h
index 205f843..20f3f88 100644
--- a/include/xen/compat.h
+++ b/include/xen/compat.h
@@ -176,15 +176,10 @@ void xlat_vcpu_runstate_info(struct vcpu_runstate_info *);
 int switch_compat(struct domain *);
 int switch_native(struct domain *);
 
-#define BITS_PER_GUEST_LONG(d) \
-    (!IS_COMPAT(d) ? BITS_PER_LONG : COMPAT_BITS_PER_LONG)
-
 #else
 
 #define compat_handle_is_null(hnd) 0
 
-#define BITS_PER_GUEST_LONG(d) BITS_PER_LONG
-
 #endif
 
 #endif /* __XEN_COMPAT_H__ */
diff --git a/include/xen/sched.h b/include/xen/sched.h
index f97687f..f3f36e8 100644
--- a/include/xen/sched.h
+++ b/include/xen/sched.h
@@ -31,12 +31,11 @@ extern unsigned long volatile jiffies;
 extern struct domain *dom0;
 
 #ifndef CONFIG_COMPAT
-#define MAX_EVTCHNS(d)     NR_EVENT_CHANNELS
+#define BITS_PER_EVTCHN_WORD(d) BITS_PER_LONG
 #else
-#define MAX_EVTCHNS(d)     (!IS_COMPAT(d) ? \
-                            NR_EVENT_CHANNELS : \
-                            sizeof(unsigned int) * sizeof(unsigned int) * 64)
+#define BITS_PER_EVTCHN_WORD(d) (has_32bit_shinfo(d) ? 32 : BITS_PER_LONG)
 #endif
+#define MAX_EVTCHNS(d) (BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d) * 64)
 #define EVTCHNS_PER_BUCKET 128
 #define NR_EVTCHN_BUCKETS  (NR_EVENT_CHANNELS / EVTCHNS_PER_BUCKET)
 
@@ -493,10 +492,6 @@ static inline void vcpu_unblock(struct vcpu *v)
 
 #define IS_PRIV(_d) ((_d)->is_privileged)
 
-#ifndef IS_COMPAT
-#define IS_COMPAT(d) 0
-#endif
-
 #define VM_ASSIST(_d,_t) (test_bit((_t), &(_d)->vm_assist))
 
 #define is_hvm_domain(d) ((d)->is_hvm)