kernel-2.6.18-128.1.10.el5.src.rpm

From: Bill Burns <bburns@redhat.com>
Date: Thu, 28 Aug 2008 13:12:38 -0400
Subject: [xen] Intel EPT Migration patch
Message-id: 20080828171239.10349.46795.sendpatchset@localhost.localdomain
O-Subject: [RHEL5.3 PATCH 3/4 v3] Xen Intel EPT Migration patch
Bugzilla: 426679
RH-Acked-by: Chris Lalancette <clalance@redhat.com>
RH-Acked-by: Don Dutile <ddutile@redhat.com>
RH-Acked-by: Rik van Riel <riel@redhat.com>

Fixes bz 426679

This patch allows live migration to work with EPT.

Intel Extended Page Table (EPT) support.
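For context on how this enables migration: live migration relies on log-dirty
tracking, where guest RAM is temporarily made non-writable, each guest write
faults once so the hypervisor can record the page, and the migration loop
periodically harvests and re-arms that record. The standalone C sketch below is
a toy model of that cycle, not Xen code; the function names are illustrative
stand-ins for what hap_enable_log_dirty(), ept_handle_violation() and
hap_clean_dirty_bitmap() do in the hunks further down.

/*
 * Toy model of the log-dirty cycle behind live migration (illustrative only).
 */
#include <stdio.h>
#include <string.h>

#define NPAGES 8

enum p2m_type { RAM_RW, RAM_LOGDIRTY };

static enum p2m_type p2m[NPAGES];   /* stands in for the EPT/P2M entry types */
static unsigned char dirty[NPAGES]; /* stands in for the log-dirty bitmap    */

/* Demote every writable RAM page to the read-only log-dirty type. */
static void enable_log_dirty(void)
{
    for (int gfn = 0; gfn < NPAGES; gfn++)
        if (p2m[gfn] == RAM_RW)
            p2m[gfn] = RAM_LOGDIRTY;
}

/* A guest write to a log-dirty page faults: record it, restore write access. */
static void guest_write(int gfn)
{
    if (p2m[gfn] == RAM_LOGDIRTY) {
        dirty[gfn] = 1;
        p2m[gfn] = RAM_RW;
    }
}

/* The migration loop harvests the bitmap, then re-arms logging for the next pass. */
static int clean_dirty_bitmap(int *gfns, int max)
{
    int n = 0;
    for (int gfn = 0; gfn < NPAGES && n < max; gfn++)
        if (dirty[gfn])
            gfns[n++] = gfn;
    memset(dirty, 0, sizeof(dirty));
    enable_log_dirty();
    return n;
}

int main(void)
{
    enable_log_dirty();
    guest_write(2);
    guest_write(5);

    int gfns[NPAGES];
    int n = clean_dirty_bitmap(gfns, NPAGES);
    for (int i = 0; i < n; i++)
        printf("resend gfn %d\n", gfns[i]); /* pages migration must copy again */
    return 0;
}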

changeset: x86, vmx: Enable EPT (Extended PageTable) support on new Intel processors.
changeset 17404:        9b635405ef90
parent 17403:   e1962ac0fb1c
child 17405:    32e3c81ada56
author:         Keir Fraser <keir.fraser@citrix.com>
date:   Wed Apr 09 11:30:32 2008 +0100 (4 months ago)
files:  tools/libxc/xc_hvm_build.c xen/arch/x86/domain.c xen/arch/x86/hvm/hvm.c
        xen/arch/x86/hvm/vmx/vmcs.c xen/arch/x86/hvm/vmx/vmx.c xen/arch/x86/mm.c
        xen/arch/x86/mm/hap/Makefile xen/arch/x86/mm/hap/p2m-ept.c xen/arch/x86/mm/p2m.c
        xen/arch/x86/mm/paging.c xen/common/domctl.c xen/drivers/passthrough/vtd/iommu.c
        xen/include/asm-x86/domain.h xen/include/asm-x86/hvm/domain.h
        xen/include/asm-x86/hvm/svm/vmcb.h xen/include/asm-x86/hvm/vmx/vmcs.h
        xen/include/asm-x86/hvm/vmx/vmx.h xen/include/asm-x86/p2m.h
        xen/include/asm-x86/paging.h xen/include/public/hvm/params.h
        xen/include/xen/hypercall.h
description:    x86, vmx: Enable EPT (Extended PageTable) support on new Intel processors.
We use the EPT page table as the P2M (guest-physical to machine
mapping), removing the linear page table when EPT is used for the
domain (see the new file p2m-ept.c). We did this by adding three
operations to the p2m_domain structure. If VT-d is enabled, the EPT
page table is also used as the VT-d page table (i.e. shared).

Signed-off-by: Xin Li <xin.b.li@intel.com>
Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
Signed-off-by: Xiaohui Xin <Xiaohui.xin@intel.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
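The "three operations" mentioned above are function pointers, so the generic p2m
code never has to know whether it is driving the regular P2M page table or the
EPT table; this patch adds a fourth hook, change_entry_type_global, for the
log-dirty paths. The condensed, compilable sketch below shows that indirection;
the types and stub bodies are simplified placeholders, and the real definitions
are in the include/asm-x86/domain.h and arch/x86/mm/p2m.c hunks of the diff.

/*
 * Condensed sketch of the p2m_domain operation table (not the real Xen
 * definitions); types, field names and stub bodies are placeholders.
 */
typedef unsigned long mfn_t;
typedef unsigned int  u32;
struct domain;

struct p2m_ops {
    int   (*set_entry)(struct domain *d, unsigned long gfn, mfn_t mfn, u32 l1e_flags);
    mfn_t (*get_entry)(struct domain *d, unsigned long gfn);
    mfn_t (*get_entry_fast)(unsigned long gfn);
    /* added by this patch: retype every entry, for log-dirty enable/disable */
    void  (*change_entry_type_global)(struct domain *d, u32 l1e_flags);
};

struct domain {
    struct p2m_ops p2m;
    int use_ept;                /* placeholder for the hap_enabled()/Intel checks */
};

/* Default backends (regular P2M page table) -- stubbed out for the sketch. */
static int   p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, u32 f) { return 1; }
static mfn_t p2m_gfn_to_mfn(struct domain *d, unsigned long gfn) { return gfn; }
static mfn_t p2m_gfn_to_mfn_fast(unsigned long gfn) { return gfn; }
static void  p2m_set_flags_global(struct domain *d, u32 f) { }

/* EPT backends (what p2m-ept.c provides) -- also stubbed out here. */
static int   ept_set_entry_stub(struct domain *d, unsigned long gfn, mfn_t mfn, u32 f) { return 1; }
static mfn_t ept_get_entry_stub(struct domain *d, unsigned long gfn) { return gfn; }
static mfn_t ept_get_entry_fast_stub(unsigned long gfn) { return gfn; }
static void  ept_change_entry_type_global_stub(struct domain *d, u32 f) { }

/* Mirrors the shape of p2m_init()/ept_p2m_init(): wire up one backend or the other. */
static void p2m_init_sketch(struct domain *d)
{
    d->p2m.set_entry = p2m_set_entry;
    d->p2m.get_entry = p2m_gfn_to_mfn;
    d->p2m.get_entry_fast = p2m_gfn_to_mfn_fast;
    d->p2m.change_entry_type_global = p2m_set_flags_global;

    if (d->use_ept) {
        d->p2m.set_entry = ept_set_entry_stub;
        d->p2m.get_entry = ept_get_entry_stub;
        d->p2m.get_entry_fast = ept_get_entry_fast_stub;
        d->p2m.change_entry_type_global = ept_change_entry_type_global_stub;
    }
}

int main(void)
{
    struct domain d = { .use_ept = 1 };
    p2m_init_sketch(&d);
    /* Callers such as hap_enable_log_dirty() go through the hook, never EPT directly. */
    d.p2m.change_entry_type_global(&d, 0);
    return 0;
}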

diff --git a/arch/x86/hvm/vmx/vmx.c b/arch/x86/hvm/vmx/vmx.c
index ed108da..0348050 100644
--- a/arch/x86/hvm/vmx/vmx.c
+++ b/arch/x86/hvm/vmx/vmx.c
@@ -3035,9 +3035,54 @@ static void vmx_reflect_exception(struct vcpu *v)
 
 static void ept_handle_violation(unsigned long qualification, paddr_t gpa)
 {
-    if ( unlikely(((qualification >> 7) & 0x3) != 0x3) )
+    unsigned long gla_validity = qualification & EPT_GLA_VALIDITY_MASK;
+    struct domain *d = current->domain;
+    u64 gfn = gpa >> PAGE_SHIFT;
+    mfn_t mfn;
+    p2m_type_t t;
+
+    /* GPA exceeds GAW. */
+    if ( unlikely(qualification & EPT_GAW_VIOLATION) )
+    {
+        printk("EPT violation: guest physical address %"PRIpaddr" exceeded "
+               "its width limit.\n", gpa);
+        domain_crash(d);
+    }
+
+    /* The validity of the guest-linear address field has 4 values:
+     * 00   -   EPT_GLA_VALIDITY_PDPTR_LOAD
+     * 01   -   EPT_GLA_VALIDITY_GPT_WALK
+     * 10   -   EPT_GLA_VALIDITY_RSVD
+     * 11   -   EPT_GLA_VALIDITY_MATCH
+     *
+     * 11 is the normal case.  01 also covers the case where a
+     * non-writable EPT page is hit while the guest page-table walk
+     * tries to set an A or D bit; this can happen in log-dirty mode.
+     */
+
+    if ( gla_validity == EPT_GLA_VALIDITY_RSVD ||
+         gla_validity == EPT_GLA_VALIDITY_PDPTR_LOAD )
     {
-        domain_crash(current->domain);
+        printk("ept violation: reserved bit or pdptr load violation.\n");
+        domain_crash(d);
+    }
+
+    mfn = ept_get_entry(d, gfn, &t);
+
+    if ( unlikely( gla_validity != EPT_GLA_VALIDITY_MATCH) )
+    {
+        if ( !p2m_is_ram(t) || !paging_mode_log_dirty(d) )
+        {
+            domain_crash(d);
+            return;
+        }
+    }
+
+    if ( p2m_is_ram(t) && paging_mode_log_dirty(d) )
+    {
+        paging_mark_dirty(d, mfn_x(mfn));
+        p2m_set_flags(d, gpa, __PAGE_HYPERVISOR|_PAGE_PSE);
+        flush_tlb_mask(d->domain_dirty_cpumask);
         return;
     }
     /* must be MMIO */
diff --git a/arch/x86/mm/hap/hap.c b/arch/x86/mm/hap/hap.c
index 5316fbd..a540a6e 100644
--- a/arch/x86/mm/hap/hap.c
+++ b/arch/x86/mm/hap/hap.c
@@ -61,7 +61,7 @@ int hap_enable_log_dirty(struct domain *d)
     hap_unlock(d);
 
     /* set l1e entries of P2M table to NOT_WRITABLE. */
-    p2m_set_flags_global(d, (_PAGE_PRESENT|_PAGE_USER));
+    p2m_change_entry_type_global(d, (_PAGE_PRESENT|_PAGE_USER));
     flush_tlb_mask(d->domain_dirty_cpumask);
     return 0;
 }
@@ -73,14 +73,14 @@ int hap_disable_log_dirty(struct domain *d)
     hap_unlock(d);
 
     /* set l1e entries of P2M table with normal mode */
-    p2m_set_flags_global(d, __PAGE_HYPERVISOR|_PAGE_USER);    
+    p2m_change_entry_type_global(d, (__PAGE_HYPERVISOR|_PAGE_USER));
     return 0;
 }
 
 void hap_clean_dirty_bitmap(struct domain *d)
 {
     /* mark physical memory as NOT_WRITEABLE and flush the TLB */
-    p2m_set_flags_global(d, (_PAGE_PRESENT|_PAGE_USER));
+    p2m_change_entry_type_global(d, (_PAGE_PRESENT|_PAGE_USER));
     flush_tlb_mask(d->domain_dirty_cpumask);
 }
 
diff --git a/arch/x86/mm/p2m-ept.c b/arch/x86/mm/p2m-ept.c
index 10921de..cceb9f3 100644
--- a/arch/x86/mm/p2m-ept.c
+++ b/arch/x86/mm/p2m-ept.c
@@ -30,10 +30,21 @@ static int ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_typ
 mfn_t ept_get_entry(struct domain *d, unsigned long gfn, p2m_type_t *t);
 static mfn_t ept_get_entry_fast(unsigned long gfn, p2m_type_t *t);
 
-static inline int 
-compat_ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, u32 l1e_flags)
+static p2m_type_t ept_flags_to_p2m_type(u32 l1e_flags)
 {
-    return ept_set_entry(d, gfn, mfn, p2m_ram_rw);
+    if ( l1e_flags & _PAGE_RW )
+        return p2m_ram_rw;
+    else if ( paging_mode_log_dirty(current->domain) )
+        return p2m_ram_logdirty;
+    return p2m_invalid;
+}
+
+static inline int
+compat_ept_set_entry(struct domain *d, unsigned long gfn,
+  mfn_t mfn, u32 l1e_flags)
+{
+    p2m_type_t t = ept_flags_to_p2m_type(l1e_flags);
+    return ept_set_entry(d, gfn, mfn,  t);
 }
 
 static mfn_t compat_ept_get_entry(struct domain *d, unsigned long gfn)
@@ -55,6 +66,26 @@ static mfn_t compat_ept_get_entry_fast(unsigned long gfn)
 
 #endif
 
+static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type)
+{
+    switch(type)
+    {
+        case p2m_ram_rw:
+        case p2m_mmio_direct:
+             entry->r = entry->w = entry->x = 1;
+            return;
+        case p2m_ram_logdirty:
+        case p2m_ram_ro:
+             entry->r = entry->x = 1;
+             entry->w = 0;
+            return;
+        case p2m_invalid:
+        case p2m_mmio_dm:
+        default:
+            return;
+    }
+}
+
 static int ept_next_level(struct domain *d, bool_t read_only,
                           ept_entry_t **table, unsigned long *gfn_remainder,
                           u32 shift)
@@ -131,6 +162,7 @@ ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
         ept_entry->avail2 = 0;
         /* last step */
         ept_entry->r = ept_entry->w = ept_entry->x = 1;
+        ept_p2m_type_to_flags(ept_entry, p2mt);
     }
     else
         ept_entry->epte = 0;
@@ -171,13 +203,10 @@ mfn_t ept_get_entry(struct domain *d, unsigned long gfn, p2m_type_t *t)
     index = gfn_remainder;
     ept_entry = table + index;
 
-    if ( (ept_entry->epte & 0x7) == 0x7 )
+    if ( ept_entry->avail1 != p2m_invalid )
     {
-        if ( ept_entry->avail1 != p2m_invalid )
-        {
-            *t = ept_entry->avail1;
-            mfn = _mfn(ept_entry->mfn);
-        }
+        *t = ept_entry->avail1;
+        mfn = _mfn(ept_entry->mfn);
     }
 
  out:
@@ -190,11 +219,117 @@ static mfn_t ept_get_entry_fast(unsigned long gfn, p2m_type_t *t)
     return ept_get_entry(current->domain, gfn, t);
 }
 
+/* Walk the whole p2m table, changing any entries of the old type
+ * to the new type.  This is used in hardware-assisted paging to
+ * quickly enable or disable log-dirty tracking */
+
+static void ept_change_entry_type_global(struct domain *d,
+                                         p2m_type_t ot, p2m_type_t nt)
+{
+    ept_entry_t *l4e, *l3e, *l2e, *l1e;
+    int i4, i3, i2, i1;
+
+    if ( pagetable_get_pfn(d->arch.phys_table) == 0 )
+        return;
+
+    BUG_ON(EPT_DEFAULT_GAW != 3);
+
+    l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
+    for (i4 = 0; i4 < EPT_PAGETABLE_ENTRIES; i4++ )
+    {
+        if ( !l4e[i4].epte )
+            continue;
+        if ( !l4e[i4].sp_avail )
+        {
+            l3e = map_domain_page(l4e[i4].mfn);
+            for ( i3 = 0; i3 < EPT_PAGETABLE_ENTRIES; i3++ )
+            {
+                if ( !l3e[i3].epte )
+                    continue;
+                if ( !l3e[i3].sp_avail )
+                {
+                    l2e = map_domain_page(l3e[i3].mfn);
+                    for ( i2 = 0; i2 < EPT_PAGETABLE_ENTRIES; i2++ )
+                    {
+                        if ( !l2e[i2].epte )
+                            continue;
+                        if ( !l2e[i2].sp_avail )
+                        {
+                            l1e = map_domain_page(l2e[i2].mfn);
+                            for ( i1  = 0; i1 < EPT_PAGETABLE_ENTRIES; i1++ )
+                            {
+                                if ( !l1e[i1].epte )
+                                    continue;
+                                if ( l1e[i1].avail1 != ot )
+                                    continue;
+                                l1e[i1].avail1 = nt;
+                                ept_p2m_type_to_flags(l1e+i1, nt);
+                            }
+                            unmap_domain_page(l1e);
+                        }
+                        else
+                        {
+                            if ( l2e[i2].avail1 != ot )
+                                continue;
+                            l2e[i2].avail1 = nt;
+                            ept_p2m_type_to_flags(l2e+i2, nt);
+                        }
+                    }
+                    unmap_domain_page(l2e);
+                }
+                else
+                {
+                    if ( l3e[i3].avail1 != ot )
+                        continue;
+                    l3e[i3].avail1 = nt;
+                    ept_p2m_type_to_flags(l3e+i3, nt);
+                }
+            }
+            unmap_domain_page(l3e);
+        }
+        else
+        {
+            if ( l4e[i4].avail1 != ot )
+                continue;
+            l4e[i4].avail1 = nt;
+            ept_p2m_type_to_flags(l4e+i4, nt);
+        }
+    }
+    unmap_domain_page(l4e);
+
+    ept_sync_all();
+}
+
+static void __ept_change_entry_type_global(struct domain *d,
+                                         u32 l1e_flags)
+{
+    p2m_type_t nt,ot;
+
+    if ( l1e_flags  == (__PAGE_HYPERVISOR|_PAGE_USER) )
+    {
+        nt = p2m_ram_rw;
+        ot = p2m_ram_logdirty;
+    }
+    else if ( l1e_flags == (_PAGE_PRESENT|_PAGE_USER) )
+    {
+        nt = p2m_ram_logdirty;
+        ot = p2m_ram_rw;
+    }
+    else
+    {
+        nt = ot = p2m_ram_rw;
+        BUG();
+    }
+
+    ept_change_entry_type_global(d, ot, nt);
+}
+
 void ept_p2m_init(struct domain *d)
 {
     d->arch.p2m.set_entry = compat_ept_set_entry;
     d->arch.p2m.get_entry = compat_ept_get_entry;
     d->arch.p2m.get_entry_fast = compat_ept_get_entry_fast;
+    d->arch.p2m.change_entry_type_global = __ept_change_entry_type_global;
 }
 
 /*
diff --git a/arch/x86/mm/p2m.c b/arch/x86/mm/p2m.c
index 1205840..fb28701 100644
--- a/arch/x86/mm/p2m.c
+++ b/arch/x86/mm/p2m.c
@@ -48,6 +48,9 @@
         (_d)->arch.p2m.locker_function = "nobody";   \
     } while (0)
 
+#define p2m_locked_by_me(_d)                     \
+    (current->processor == (_d)->arch.p2m.locker)
+
 #define p2m_lock(_d)                                                \
     do {                                                            \
         if ( unlikely((_d)->arch.p2m.locker == current->processor) )\
@@ -315,12 +318,20 @@ void p2m_init(struct domain *d)
     d->arch.p2m.set_entry = p2m_set_entry;
     d->arch.p2m.get_entry = p2m_gfn_to_mfn;
     d->arch.p2m.get_entry_fast = p2m_gfn_to_mfn_fast;
+    d->arch.p2m.change_entry_type_global = p2m_set_flags_global;
 
     if ( is_hvm_domain(d) && hap_enabled(d) &&
          (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
         ept_p2m_init(d);
 }
 
+void p2m_change_entry_type_global(struct domain *d, u32 l1e_flags)
+{
+    p2m_lock(d);
+    d->arch.p2m.change_entry_type_global(d, l1e_flags);
+    p2m_unlock(d);
+}
+
 static inline
 int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, u32 l1e_flags)
 {
@@ -882,8 +893,8 @@ void p2m_set_flags_global(struct domain *d, u32 l1e_flags)
     if ( pagetable_get_pfn(d->arch.phys_table) == 0 )
         return;
 
-    p2m_lock(d);
-        
+    ASSERT(p2m_locked_by_me(d));
+
 #if CONFIG_PAGING_LEVELS == 4
     l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
 #elif CONFIG_PAGING_LEVELS == 3
@@ -952,7 +963,6 @@ void p2m_set_flags_global(struct domain *d, u32 l1e_flags)
     unmap_domain_page(l2e);
 #endif
 
-    p2m_unlock(d);
 }
 
 /* This function traces through P2M table and modifies l1e flags of a specific
diff --git a/include/asm-x86/domain.h b/include/asm-x86/domain.h
index 53028d2..50f6245 100644
--- a/include/asm-x86/domain.h
+++ b/include/asm-x86/domain.h
@@ -144,6 +144,9 @@ struct p2m_domain {
     mfn_t              (*get_entry   )(struct domain *d, unsigned long gfn);
     mfn_t              (*get_entry_fast)(unsigned long gfn);
 
+    void               (*change_entry_type_global)(struct domain *d,
+                                                    u32 l1e_flags);
+
     /* Highest guest frame that's ever been mapped in the p2m */
     unsigned long max_mapped_pfn;
 };
diff --git a/include/asm-x86/hvm/vmx/vmx.h b/include/asm-x86/hvm/vmx/vmx.h
index dc61578..b80d0fa 100644
--- a/include/asm-x86/hvm/vmx/vmx.h
+++ b/include/asm-x86/hvm/vmx/vmx.h
@@ -47,6 +47,8 @@ typedef union {
 
 #define EPT_TABLE_ORDER     9
 
+extern mfn_t ept_get_entry(struct domain *d, unsigned long gfn, p2m_type_t *t);
+
 void vmx_asm_vmexit_handler(struct cpu_user_regs);
 void vmx_asm_do_vmentry(void);
 void vmx_intr_assist(void);
@@ -363,4 +365,47 @@ static inline void vmx_inject_extint(struct vcpu *v, int trap, int error_code)
 
 void ept_p2m_init(struct domain *d);
 
+/* EPT violation qualifications definitions */
+/* bit offset 0 in exit qualification */
+#define _EPT_READ_VIOLATION         0
+#define EPT_READ_VIOLATION          (1UL<<_EPT_READ_VIOLATION)
+/* bit offset 1 in exit qualification */
+#define _EPT_WRITE_VIOLATION        1
+#define EPT_WRITE_VIOLATION         (1UL<<_EPT_WRITE_VIOLATION)
+/* bit offset 2 in exit qualification */
+#define _EPT_EXEC_VIOLATION         2
+#define EPT_EXEC_VIOLATION          (1UL<<_EPT_EXEC_VIOLATION)
+
+/* bit offset 3 in exit qualification */
+#define _EPT_EFFECTIVE_READ         3
+#define EPT_EFFECTIVE_READ          (1UL<<_EPT_EFFECTIVE_READ)
+/* bit offset 4 in exit qualification */
+#define _EPT_EFFECTIVE_WRITE        4
+#define EPT_EFFECTIVE_WRITE         (1UL<<_EPT_EFFECTIVE_WRITE)
+/* bit offset 5 in exit qualification */
+#define _EPT_EFFECTIVE_EXEC         5
+#define EPT_EFFECTIVE_EXEC          (1UL<<_EPT_EFFECTIVE_EXEC)
+
+/* bit offset 6 in exit qualification */
+#define _EPT_GAW_VIOLATION          6
+#define EPT_GAW_VIOLATION           (1UL<<_EPT_GAW_VIOLATION)
+
+/* bits offset 7 & 8 in exit qualification */
+#define _EPT_GLA_VALIDITY           7
+#define EPT_GLA_VALIDITY_MASK       (3UL<<_EPT_GLA_VALIDITY)
+/* gla != gpa, when load PDPTR */
+#define EPT_GLA_VALIDITY_PDPTR_LOAD (0UL<<_EPT_GLA_VALIDITY)
+/* gla != gpa, during guest page table walking */
+#define EPT_GLA_VALIDITY_GPT_WALK   (1UL<<_EPT_GLA_VALIDITY)
+/* reserved */
+#define EPT_GLA_VALIDITY_RSVD       (2UL<<_EPT_GLA_VALIDITY)
+/* gla == gpa, normal case */
+#define EPT_GLA_VALIDITY_MATCH      (3UL<<_EPT_GLA_VALIDITY)
+
+#define EPT_EFFECTIVE_MASK          (EPT_EFFECTIVE_READ  |  \
+                                     EPT_EFFECTIVE_WRITE |  \
+                                     EPT_EFFECTIVE_EXEC)
+
+#define EPT_PAGETABLE_ENTRIES       512
+
 #endif /* __ASM_X86_HVM_VMX_VMX_H__ */
diff --git a/include/asm-x86/p2m.h b/include/asm-x86/p2m.h
index 40501e0..71035d0 100644
--- a/include/asm-x86/p2m.h
+++ b/include/asm-x86/p2m.h
@@ -200,6 +200,8 @@ void guest_physmap_remove_page(struct domain *d, unsigned long gfn,
 /* set P2M table l1e flags */
 void p2m_set_flags_global(struct domain *d, u32 l1e_flags);
 
+void p2m_change_entry_type_global(struct domain *d, u32 l1e_flags);
+
 /* set P2M table l1e flags for a gpa */
 int p2m_set_flags(struct domain *d, paddr_t gpa, u32 l1e_flags);