kernel-2.6.18-194.11.1.el5.src.rpm

From: Chris Lalancette <clalance@redhat.com>
Date: Wed, 13 Aug 2008 12:13:22 +0200
Subject: [xen] avoid dom0 hang when tearing down domains
Message-id: 48A2B3C2.1040404@redhat.com
O-Subject: [RHEL5.3 PATCH]: Avoid dom0 hang when tearing down domains
Bugzilla: 347161
RH-Acked-by: Rik van Riel <riel@redhat.com>

Currently, on a large-memory machine (say 16GB or more), shutting down a large
PV domain (12GB or larger) causes a pause in the dom0 for a variable number of
seconds, depending on how big the guest is.  These hangs are caused by the
hypervisor code that tears down the guest's pagetables.  There are two
problems:

1)  Tearing down the page tables is completely synchronous, so while it is
happening nothing else can be scheduled on that CPU (a simplified sketch of
this pre-patch loop follows below).

2)  Scrubbing the pages runs into heavy lock contention among the CPUs in the
system, causing further slowdown.
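
To make problem 1 concrete: a 12GB PV guest owns roughly 12GB / 4KB =
3,145,728 pages, and before this patch every one of them is released in a
single uninterrupted pass.  The following trivial, self-contained C sketch
shows only that pre-patch shape; the toy types are not Xen code, and
release_page() merely stands in for the real per-page work done inside
relinquish_memory():

#include <stdio.h>

/* Toy stand-ins for the real Xen structures; release_page() represents the
 * per-page work (put_page() and friends) that relinquish_memory() does. */
struct toy_page { int id; };

static void release_page(struct toy_page *pg) { (void)pg; }

/* Pre-patch shape: one synchronous pass over every page the guest owns.
 * With ~3 million pages for a 12GB guest, nothing else can be scheduled on
 * this CPU until the loop finishes, which is the dom0 pause described
 * above. */
static void relinquish_all(struct toy_page *pages, unsigned long nr)
{
    unsigned long i;

    for ( i = 0; i < nr; i++ )
        release_page(&pages[i]);
}

int main(void)
{
    struct toy_page pages[8] = { { 0 } };

    relinquish_all(pages, 8);
    printf("all pages released in one uninterrupted pass\n");
    return 0;
}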

The first problem is addressed by the attached patch, which makes the code
that tears down the page tables preemptible.  The second problem is addressed
by Tetsu Yamamoto's recently posted patch series for ia64, titled "[PATCH 0/7]
xen: fix soft lockup on creating a domain with large memory".  With both
Tetsu's patchset and this patch in place, the hangs are completely avoided.
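
How the patch achieves this can be seen in relinquish_memory() below: after
each page is dealt with, hypercall_preempt_check() is consulted, and if a
preemption is pending the function records its progress (pages already
handled have been moved onto d->arch.relmem_list) and returns -EAGAIN so the
teardown can be continued later.  Here is a minimal, self-contained C sketch
of that pattern; the toy types and preempt_pending() are hypothetical
stand-ins (the latter for hypercall_preempt_check()), and only the overall
shape comes from the patch:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins: a singly linked list of pages owned by a domain. */
struct toy_page { struct toy_page *next; int id; };
struct toy_domain {
    struct toy_page *page_list;   /* pages still to be released          */
    struct toy_page *relmem_list; /* pages already dealt with (progress) */
};

/* Stand-in for hypercall_preempt_check(): pretend something is waiting
 * after every second page so that the example actually gets preempted. */
static bool preempt_pending(void)
{
    static int calls;
    return (++calls % 2) == 0;
}

/* Release pages until done or until preemption is requested.  Returns 0
 * when the list is empty, -EAGAIN when interrupted; because finished pages
 * are moved onto relmem_list, a later call resumes where this one stopped. */
static int relinquish_memory(struct toy_domain *d)
{
    while ( d->page_list != NULL )
    {
        struct toy_page *pg = d->page_list;

        d->page_list = pg->next;
        printf("released page %d\n", pg->id);  /* the real put_page() work */

        pg->next = d->relmem_list;             /* move to the progress list */
        d->relmem_list = pg;

        if ( preempt_pending() )
            return -EAGAIN;
    }
    return 0;
}

int main(void)
{
    struct toy_page pages[5];
    struct toy_domain d = { &pages[0], NULL };
    int i, rc, passes = 0;

    for ( i = 0; i < 5; i++ )
    {
        pages[i].id = i;
        pages[i].next = (i < 4) ? &pages[i + 1] : NULL;
    }

    /* The caller keeps re-entering until the work is finished; in the patch
     * that retrying happens through domain_kill(). */
    while ( (rc = relinquish_memory(&d)) == -EAGAIN )
        passes++;
    printf("done after %d extra passes, rc=%d\n", passes, rc);
    return 0;
}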

Tested by me on x86_64.  Before this patch, running "xm shutdown large_domain ;
while true ; do date ; sleep 1 ; done" would show a 7-10 second delay on my
15 GB guest.  After this patch, no delay is observed when shutting down a large
domain.

This patch is a combination of xen-unstable c/s 15821, 15838, 17625, 17761, and
17835.  This solves BZ 347161.
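
The other half of the change makes domain_relinquish_resources() itself
restartable: progress is recorded in a new d->arch.relmem field, and a switch
with deliberate fallthrough resumes at whichever stage was interrupted.  The
compact, self-contained C sketch below shows only that structure; the
RELMEM_* states mirror the enum added by the patch, while do_stage() and the
work counters are hypothetical stand-ins for the real relinquish_memory()
calls:

#include <errno.h>
#include <stdio.h>

/* Mirrors the enum added to struct arch_domain by the patch. */
enum relmem_state {
    RELMEM_not_started, RELMEM_xen, RELMEM_l4, RELMEM_l3, RELMEM_l2, RELMEM_done
};

struct toy_domain {
    enum relmem_state relmem;
    int work_left[4];   /* hypothetical "work remaining" per stage */
};

/* Hypothetical per-stage worker: returns 0 when the stage is finished,
 * -EAGAIN when it had to stop early (one unit of work per call here). */
static int do_stage(struct toy_domain *d, int stage)
{
    if ( d->work_left[stage] > 0 )
    {
        d->work_left[stage]--;
        return d->work_left[stage] ? -EAGAIN : 0;
    }
    return 0;
}

/* Restartable teardown: every -EAGAIN return leaves d->relmem pointing at
 * the stage to resume, and fallthrough walks the remaining stages. */
static int relinquish_resources(struct toy_domain *d)
{
    int ret;

    switch ( d->relmem )
    {
    case RELMEM_not_started:
        d->relmem = RELMEM_xen;
        /* fallthrough */
    case RELMEM_xen:
        if ( (ret = do_stage(d, 0)) != 0 )
            return ret;
        d->relmem = RELMEM_l4;
        /* fallthrough */
    case RELMEM_l4:
        if ( (ret = do_stage(d, 1)) != 0 )
            return ret;
        d->relmem = RELMEM_l3;
        /* fallthrough */
    case RELMEM_l3:
        if ( (ret = do_stage(d, 2)) != 0 )
            return ret;
        d->relmem = RELMEM_l2;
        /* fallthrough */
    case RELMEM_l2:
        if ( (ret = do_stage(d, 3)) != 0 )
            return ret;
        d->relmem = RELMEM_done;
        /* fallthrough */
    case RELMEM_done:
        break;
    }
    return 0;
}

int main(void)
{
    struct toy_domain d = { RELMEM_not_started, { 3, 1, 2, 1 } };
    int rc, calls = 0;

    do {
        rc = relinquish_resources(&d);
        calls++;
    } while ( rc == -EAGAIN );

    printf("torn down in %d calls\n", calls);
    return 0;
}

Each call of domain_kill() (see the common/domain.c hunk) re-enters this path,
so the teardown makes forward progress a piece at a time instead of in one
long, dom0-freezing pass.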

Please review and ACK.

Chris Lalancette

diff --git a/arch/x86/domain.c b/arch/x86/domain.c
index db86d09..298432c 100644
--- a/arch/x86/domain.c
+++ b/arch/x86/domain.c
@@ -51,8 +51,6 @@
 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
 DEFINE_PER_CPU(__u64, efer);
 
-static void unmap_vcpu_info(struct vcpu *v);
-
 static void paravirt_ctxt_switch_from(struct vcpu *v);
 static void paravirt_ctxt_switch_to(struct vcpu *v);
 
@@ -400,8 +398,6 @@ void vcpu_destroy(struct vcpu *v)
     if ( is_pv_32on64_vcpu(v) )
         release_compat_l4(v);
 
-    unmap_vcpu_info(v);
-
     if ( is_hvm_vcpu(v) )
         hvm_vcpu_destroy(v);
 }
@@ -416,6 +412,8 @@ int arch_domain_create(struct domain *d)
     int vcpuid, pdpt_order;
     int rc = -ENOMEM;
 
+    d->arch.relmem = RELMEM_not_started;
+    INIT_LIST_HEAD(&d->arch.relmem_list);
     pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
     d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
     if ( d->arch.mm_perdomain_pt == NULL )
@@ -1606,12 +1604,13 @@ int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...)
 }
 #endif
 
-static void relinquish_memory(struct domain *d, struct list_head *list,
+static int relinquish_memory(struct domain *d, struct list_head *list,
                               unsigned long type)
 {
     struct list_head *ent;
     struct page_info  *page;
     unsigned long     x, y;
+    int               ret = 0;
 
     /* Use a recursive lock, as we may enter 'free_domheap_page'. */
     spin_lock_recursive(&d->page_alloc_lock);
@@ -1626,6 +1625,7 @@ static void relinquish_memory(struct domain *d, struct list_head *list,
         {
             /* Couldn't get a reference -- someone is freeing this page. */
             ent = ent->next;
+            list_move_tail(&page->list, &d->arch.relmem_list);
             continue;
         }
 
@@ -1635,6 +1635,27 @@ static void relinquish_memory(struct domain *d, struct list_head *list,
         if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
             put_page(page);
 
+#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
+        /*
+         * Forcibly drop reference counts of page tables above top most (which
+         * were skipped to prevent long latencies due to deep recursion - see
+         * the special treatment in free_lX_table()).
+         */
+        y = page->u.inuse.type_info;
+        if ( (type < PGT_root_page_table) &&
+             unlikely(((y + PGT_type_mask) &
+                       (PGT_type_mask|PGT_validated)) == type) )
+        {
+            BUG_ON((y & PGT_count_mask) >=
+                   (page->count_info & PGC_count_mask));
+            while ( y & PGT_count_mask )
+            {
+                put_page_and_type(page);
+                y = page->u.inuse.type_info;
+            }
+        }
+#endif
+
         /*
          * Forcibly invalidate top-most, still valid page tables at this point
          * to break circular 'linear page table' references. This is okay
@@ -1660,10 +1681,21 @@ static void relinquish_memory(struct domain *d, struct list_head *list,
 
         /* Follow the list chain and /then/ potentially free the page. */
         ent = ent->next;
+        list_move_tail(&page->list, &d->arch.relmem_list);
         put_page(page);
+
+        if ( hypercall_preempt_check() )
+        {
+            ret = -EAGAIN;
+            goto out;
+        }
     }
 
+    list_splice_init(&d->arch.relmem_list, list);
+
+ out:
     spin_unlock_recursive(&d->page_alloc_lock);
+    return ret;
 }
 
 static void vcpu_destroy_pagetables(struct vcpu *v)
@@ -1725,35 +1757,78 @@ static void vcpu_destroy_pagetables(struct vcpu *v)
 
 int domain_relinquish_resources(struct domain *d)
 {
+    int ret;
     struct vcpu *v;
 
     BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
 
-    /* Drop the in-use references to page-table bases. */
-    for_each_vcpu ( d, v )
-        vcpu_destroy_pagetables(v);
+    switch ( d->arch.relmem )
+    {
+    case RELMEM_not_started:
+        /* Tear down paging-assistance stuff. */
+        paging_teardown(d);
 
-    /* Tear down paging-assistance stuff. */
-    paging_teardown(d);
+        for_each_vcpu ( d, v )
+        {
+            /* Drop the in-use references to page-table bases. */
+            vcpu_destroy_pagetables(v);
+      
+            /*
+             * Relinquish GDT mappings. No need for explicit unmapping of the
+             * LDT as it automatically gets squashed with the guest mappings.
+             */
+            destroy_gdt(v);
+
+            unmap_vcpu_info(v);
+        }
 
-    /*
-     * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
-     * it automatically gets squashed when the guest's mappings go away.
-     */
-    for_each_vcpu(d, v)
-        destroy_gdt(v);
+        d->arch.relmem = RELMEM_xen;
+        /* fallthrough */
 
-    /* Relinquish every page of memory. */
+        /* Relinquish every page of memory. */
+    case RELMEM_xen:
+        ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
+        if ( ret )
+            return ret;
 #if CONFIG_PAGING_LEVELS >= 4
-    relinquish_memory(d, &d->xenpage_list, PGT_l4_page_table);
-    relinquish_memory(d, &d->page_list, PGT_l4_page_table);
+        d->arch.relmem = RELMEM_l4;
+        /* fallthrough */
+
+    case RELMEM_l4:
+        ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table);
+        if ( ret )
+            return ret;
 #endif
 #if CONFIG_PAGING_LEVELS >= 3
-    relinquish_memory(d, &d->xenpage_list, PGT_l3_page_table);
-    relinquish_memory(d, &d->page_list, PGT_l3_page_table);
+        d->arch.relmem = RELMEM_l3;
+        /* fallthrough */
+
+    case RELMEM_l3:
+        ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table);
+        if ( ret )
+            return ret;
 #endif
-    relinquish_memory(d, &d->xenpage_list, PGT_l2_page_table);
-    relinquish_memory(d, &d->page_list, PGT_l2_page_table);
+        d->arch.relmem = RELMEM_l2;
+        /* fallthrough */
+
+    case RELMEM_l2:
+        ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table);
+        if ( ret )
+            return ret;
+        d->arch.relmem = RELMEM_done;
+        /* fallthrough */
+
+    case RELMEM_done:
+#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
+        ret = relinquish_memory(d, &d->page_list, PGT_l1_page_table);
+        if ( ret )
+            return ret;
+#endif
+        break;
+
+    default:
+        BUG();
+    }
 
     /* Free page used by xen oprofile buffer. */
     free_xenoprof_pages(d);
diff --git a/arch/x86/domctl.c b/arch/x86/domctl.c
index 36dd4a5..9cd0c71 100644
--- a/arch/x86/domctl.c
+++ b/arch/x86/domctl.c
@@ -230,10 +230,14 @@ long arch_do_domctl(
         ret = -EINVAL;
         if ( d != NULL )
         {
-            ret = 0;
-
             spin_lock(&d->page_alloc_lock);
 
+            if ( unlikely(d->is_dying) ) {
+                spin_unlock(&d->page_alloc_lock);
+                goto getmemlist_out;
+            }
+
+            ret = 0;
             list_ent = d->page_list.next;
             for ( i = 0; (i < max_pfns) && (list_ent != &d->page_list); i++ )
             {
@@ -253,6 +257,7 @@ long arch_do_domctl(
             domctl->u.getmemlist.num_pfns = i;
             copy_to_guest(u_domctl, domctl, 1);
 
+        getmemlist_out:
             rcu_unlock_domain(d);
         }
     }
diff --git a/arch/x86/mm.c b/arch/x86/mm.c
index f98ac50..557a3d9 100644
--- a/arch/x86/mm.c
+++ b/arch/x86/mm.c
@@ -1258,6 +1258,11 @@ static void free_l3_table(struct page_info *page)
     l3_pgentry_t *pl3e;
     int           i;
 
+#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
+    if ( d->arch.relmem == RELMEM_l3 )
+        return;
+#endif
+
     pl3e = map_domain_page(pfn);
 
     for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
@@ -1281,6 +1286,11 @@ static void free_l4_table(struct page_info *page)
     l4_pgentry_t *pl4e = page_to_virt(page);
     int           i;
 
+#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
+    if ( d->arch.relmem == RELMEM_l4 )
+        return;
+#endif
+
     for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
         if ( is_guest_l4_slot(d, i) )
             put_page_from_l4e(pl4e[i], pfn);
diff --git a/common/domain.c b/common/domain.c
index 0984cf8..33e464b 100644
--- a/common/domain.c
+++ b/common/domain.c
@@ -334,6 +334,7 @@ int domain_kill(struct domain *d)
         d->is_dying = DOMDYING_dying;
         evtchn_destroy(d);
         gnttab_release_mappings(d);
+        /* fallthrough */
     case DOMDYING_dying:
         rc = domain_relinquish_resources(d);
         page_scrub_kick();
@@ -345,6 +346,7 @@ int domain_kill(struct domain *d)
         d->is_dying = DOMDYING_dead;
         put_domain(d);
         send_guest_global_virq(dom0, VIRQ_DOM_EXC);
+        /* fallthrough */
     case DOMDYING_dead:
         break;
     }
diff --git a/include/asm-x86/config.h b/include/asm-x86/config.h
index 9de0c03..83e118b 100644
--- a/include/asm-x86/config.h
+++ b/include/asm-x86/config.h
@@ -39,6 +39,14 @@
 
 #define CONFIG_VGA 1
 
+/*
+ * Avoid deep recursion when tearing down pagetables during domain destruction,
+ * causing dom0 to become unresponsive and Xen to miss time-critical softirq
+ * deadlines. This will ultimately be replaced by built-in preemptibility of
+ * get_page_type().
+ */
+#define DOMAIN_DESTRUCT_AVOID_RECURSION 1
+
 #define HZ 100
 
 #define OPT_CONSOLE_STR "com1,vga"
diff --git a/include/asm-x86/domain.h b/include/asm-x86/domain.h
index 6e00e4d..dcf1f53 100644
--- a/include/asm-x86/domain.h
+++ b/include/asm-x86/domain.h
@@ -232,6 +232,17 @@ struct arch_domain
     bool_t is_32bit_pv;
     /* Is shared-info page in 32-bit format? */
     bool_t has_32bit_shinfo;
+
+    /* Continuable domain_relinquish_resources(). */
+    enum {
+        RELMEM_not_started,
+        RELMEM_xen,
+        RELMEM_l4,
+        RELMEM_l3,
+        RELMEM_l2,
+        RELMEM_done,
+    } relmem;
+    struct list_head relmem_list;
 } __cacheline_aligned;
 
 #ifdef CONFIG_X86_PAE