From: Bill Burns <bburns@redhat.com>
Date: Thu, 28 Aug 2008 13:12:38 -0400
Subject: [xen] Intel EPT Migration patch
Message-id: 20080828171239.10349.46795.sendpatchset@localhost.localdomain
O-Subject: [RHEL5.3 PATCH 3/4 v3] Xen Intel EPT Migration patch
Bugzilla: 426679
RH-Acked-by: Chris Lalancette <clalance@redhat.com>
RH-Acked-by: Don Dutile <ddutile@redhat.com>
RH-Acked-by: Rik van Riel <riel@redhat.com>

Fixes bz 426679

This patch allows live migration to work with EPT.

Intel Extended Page Table (EPT) support.

changeset:   x86, vmx: Enable EPT (Extended PageTable) support on new Intel processors.

changeset 17404:  9b635405ef90
parent 17403:     e1962ac0fb1c
child 17405:      32e3c81ada56
author:           Keir Fraser <keir.fraser@citrix.com>
date:             Wed Apr 09 11:30:32 2008 +0100 (4 months ago)
files:            tools/libxc/xc_hvm_build.c xen/arch/x86/domain.c
                  xen/arch/x86/hvm/hvm.c xen/arch/x86/hvm/vmx/vmcs.c
                  xen/arch/x86/hvm/vmx/vmx.c xen/arch/x86/mm.c
                  xen/arch/x86/mm/hap/Makefile xen/arch/x86/mm/hap/p2m-ept.c
                  xen/arch/x86/mm/p2m.c xen/arch/x86/mm/paging.c
                  xen/common/domctl.c xen/drivers/passthrough/vtd/iommu.c
                  xen/include/asm-x86/domain.h xen/include/asm-x86/hvm/domain.h
                  xen/include/asm-x86/hvm/svm/vmcb.h
                  xen/include/asm-x86/hvm/vmx/vmcs.h
                  xen/include/asm-x86/hvm/vmx/vmx.h xen/include/asm-x86/p2m.h
                  xen/include/asm-x86/paging.h xen/include/public/hvm/params.h
                  xen/include/xen/hypercall.h
description:      x86, vmx: Enable EPT (Extended PageTable) support on new Intel processors.

We use the EPT page table as the P2M (guest-physical to machine mapping),
removing the linear page table when EPT is used for the domain (see the
new file p2m-ept.c). We did this by adding three operations to the
p2m_domain structure. If VT-d is enabled, the EPT page table will also be
used as the VT-d page table (i.e. shared).

Signed-off-by: Xin Li <xin.b.li@intel.com>
Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
Signed-off-by: Xiaohui Xin <Xiaohui.xin@intel.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>

diff --git a/arch/x86/hvm/vmx/vmx.c b/arch/x86/hvm/vmx/vmx.c
index ed108da..0348050 100644
--- a/arch/x86/hvm/vmx/vmx.c
+++ b/arch/x86/hvm/vmx/vmx.c
@@ -3035,9 +3035,54 @@ static void vmx_reflect_exception(struct vcpu *v)
 
 static void ept_handle_violation(unsigned long qualification, paddr_t gpa)
 {
-    if ( unlikely(((qualification >> 7) & 0x3) != 0x3) )
+    unsigned long gla_validity = qualification & EPT_GLA_VALIDITY_MASK;
+    struct domain *d = current->domain;
+    u64 gfn = gpa >> PAGE_SHIFT;
+    mfn_t mfn;
+    p2m_type_t t;
+
+    /* GPA exceeds GAW. */
+    if ( unlikely(qualification & EPT_GAW_VIOLATION) )
+    {
+        printk("EPT violation: guest physical address %"PRIpaddr" exceeded "
+               "its width limit.\n", gpa);
+        domain_crash(d);
+    }
+
+    /* The validity of the guest-linear address field has 4 values:
+     *  00 - EPT_GLA_VALIDITY_PDPTR_LOAD
+     *  01 - EPT_GLA_VALIDITY_GPT_WALK
+     *  10 - EPT_GLA_VALIDITY_RSVD
+     *  11 - EPT_GLA_VALIDITY_MATCH
+     *
+     * 11 is the normal case. 01 also covers the case where a no-write
+     * EPT page is encountered when trying to write the A or D bits;
+     * this may occur when we are in log-dirty mode.
+     */
+
+    if ( gla_validity == EPT_GLA_VALIDITY_RSVD ||
+         gla_validity == EPT_GLA_VALIDITY_PDPTR_LOAD )
     {
-        domain_crash(current->domain);
+        printk("ept violation: reserved bit or pdptr load violation.\n");
+        domain_crash(d);
+    }
+
+    mfn = ept_get_entry(d, gfn, &t);
+
+    if ( unlikely( gla_validity != EPT_GLA_VALIDITY_MATCH) )
+    {
+        if ( !p2m_is_ram(t) || !paging_mode_log_dirty(d) )
+        {
+            domain_crash(d);
+            return;
+        }
+    }
+
+    if ( p2m_is_ram(t) && paging_mode_log_dirty(d) )
+    {
+        paging_mark_dirty(d, mfn_x(mfn));
+        p2m_set_flags(d, gpa, __PAGE_HYPERVISOR|_PAGE_PSE);
+        flush_tlb_mask(d->domain_dirty_cpumask);
         return;
     }
 
     /* must be MMIO */
diff --git a/arch/x86/mm/hap/hap.c b/arch/x86/mm/hap/hap.c
index 5316fbd..a540a6e 100644
--- a/arch/x86/mm/hap/hap.c
+++ b/arch/x86/mm/hap/hap.c
@@ -61,7 +61,7 @@ int hap_enable_log_dirty(struct domain *d)
     hap_unlock(d);
 
     /* set l1e entries of P2M table to NOT_WRITABLE. */
-    p2m_set_flags_global(d, (_PAGE_PRESENT|_PAGE_USER));
+    p2m_change_entry_type_global(d, (_PAGE_PRESENT|_PAGE_USER));
     flush_tlb_mask(d->domain_dirty_cpumask);
     return 0;
 }
@@ -73,14 +73,14 @@ int hap_disable_log_dirty(struct domain *d)
     hap_unlock(d);
 
     /* set l1e entries of P2M table with normal mode */
-    p2m_set_flags_global(d, __PAGE_HYPERVISOR|_PAGE_USER);
+    p2m_change_entry_type_global(d, (__PAGE_HYPERVISOR|_PAGE_USER));
     return 0;
 }
 
 void hap_clean_dirty_bitmap(struct domain *d)
 {
     /* mark physical memory as NOT_WRITEABLE and flush the TLB */
-    p2m_set_flags_global(d, (_PAGE_PRESENT|_PAGE_USER));
+    p2m_change_entry_type_global(d, (_PAGE_PRESENT|_PAGE_USER));
     flush_tlb_mask(d->domain_dirty_cpumask);
 }
 
diff --git a/arch/x86/mm/p2m-ept.c b/arch/x86/mm/p2m-ept.c
index 10921de..cceb9f3 100644
--- a/arch/x86/mm/p2m-ept.c
+++ b/arch/x86/mm/p2m-ept.c
@@ -30,10 +30,21 @@ static int ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_typ
 mfn_t ept_get_entry(struct domain *d, unsigned long gfn, p2m_type_t *t);
 static mfn_t ept_get_entry_fast(unsigned long gfn, p2m_type_t *t);
 
-static inline int
-compat_ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, u32 l1e_flags)
+static p2m_type_t ept_flags_to_p2m_type(u32 l1e_flags)
 {
-    return ept_set_entry(d, gfn, mfn, p2m_ram_rw);
+    if ( l1e_flags & _PAGE_RW )
+        return p2m_ram_rw;
+    else if ( paging_mode_log_dirty(current->domain) )
+        return p2m_ram_logdirty;
+    return p2m_invalid;
+}
+
+static inline int
+compat_ept_set_entry(struct domain *d, unsigned long gfn,
+                     mfn_t mfn, u32 l1e_flags)
+{
+    p2m_type_t t = ept_flags_to_p2m_type(l1e_flags);
+    return ept_set_entry(d, gfn, mfn, t);
 }
 
 static mfn_t compat_ept_get_entry(struct domain *d, unsigned long gfn)
@@ -55,6 +66,26 @@ static mfn_t compat_ept_get_entry_fast(unsigned long gfn)
 
 #endif
 
+static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type)
+{
+    switch(type)
+    {
+        case p2m_ram_rw:
+        case p2m_mmio_direct:
+            entry->r = entry->w = entry->x = 1;
+            return;
+        case p2m_ram_logdirty:
+        case p2m_ram_ro:
+            entry->r = entry->x = 1;
+            entry->w = 0;
+            return;
+        case p2m_invalid:
+        case p2m_mmio_dm:
+        default:
+            return;
+    }
+}
+
 static int ept_next_level(struct domain *d, bool_t read_only,
                           ept_entry_t **table, unsigned long *gfn_remainder,
                           u32 shift)
@@ -131,6 +162,7 @@ ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
         ept_entry->avail2 = 0;
         /* last step */
         ept_entry->r = ept_entry->w = ept_entry->x = 1;
+        ept_p2m_type_to_flags(ept_entry, p2mt);
     }
     else
         ept_entry->epte = 0;
@@ -171,13 +203,10 @@ mfn_t ept_get_entry(struct domain *d, unsigned long gfn,
                     p2m_type_t *t)
     index = gfn_remainder;
     ept_entry = table + index;
 
-    if ( (ept_entry->epte & 0x7) == 0x7 )
+    if ( ept_entry->avail1 != p2m_invalid )
     {
-        if ( ept_entry->avail1 != p2m_invalid )
-        {
-            *t = ept_entry->avail1;
-            mfn = _mfn(ept_entry->mfn);
-        }
+        *t = ept_entry->avail1;
+        mfn = _mfn(ept_entry->mfn);
     }
 
 out:
@@ -190,11 +219,117 @@ static mfn_t ept_get_entry_fast(unsigned long gfn, p2m_type_t *t)
     return ept_get_entry(current->domain, gfn, t);
 }
 
+/* Walk the whole p2m table, changing any entries of the old type
+ * to the new type. This is used in hardware-assisted paging to
+ * quickly enable or disable log-dirty tracking. */
+
+static void ept_change_entry_type_global(struct domain *d,
+                                         p2m_type_t ot, p2m_type_t nt)
+{
+    ept_entry_t *l4e, *l3e, *l2e, *l1e;
+    int i4, i3, i2, i1;
+
+    if ( pagetable_get_pfn(d->arch.phys_table) == 0 )
+        return;
+
+    BUG_ON(EPT_DEFAULT_GAW != 3);
+
+    l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
+    for (i4 = 0; i4 < EPT_PAGETABLE_ENTRIES; i4++ )
+    {
+        if ( !l4e[i4].epte )
+            continue;
+        if ( !l4e[i4].sp_avail )
+        {
+            l3e = map_domain_page(l4e[i4].mfn);
+            for ( i3 = 0; i3 < EPT_PAGETABLE_ENTRIES; i3++ )
+            {
+                if ( !l3e[i3].epte )
+                    continue;
+                if ( !l3e[i3].sp_avail )
+                {
+                    l2e = map_domain_page(l3e[i3].mfn);
+                    for ( i2 = 0; i2 < EPT_PAGETABLE_ENTRIES; i2++ )
+                    {
+                        if ( !l2e[i2].epte )
+                            continue;
+                        if ( !l2e[i2].sp_avail )
+                        {
+                            l1e = map_domain_page(l2e[i2].mfn);
+                            for ( i1 = 0; i1 < EPT_PAGETABLE_ENTRIES; i1++ )
+                            {
+                                if ( !l1e[i1].epte )
+                                    continue;
+                                if ( l1e[i1].avail1 != ot )
+                                    continue;
+                                l1e[i1].avail1 = nt;
+                                ept_p2m_type_to_flags(l1e+i1, nt);
+                            }
+                            unmap_domain_page(l1e);
+                        }
+                        else
+                        {
+                            if ( l2e[i2].avail1 != ot )
+                                continue;
+                            l2e[i2].avail1 = nt;
+                            ept_p2m_type_to_flags(l2e+i2, nt);
+                        }
+                    }
+                    unmap_domain_page(l2e);
+                }
+                else
+                {
+                    if ( l3e[i3].avail1 != ot )
+                        continue;
+                    l3e[i3].avail1 = nt;
+                    ept_p2m_type_to_flags(l3e+i3, nt);
+                }
+            }
+            unmap_domain_page(l3e);
+        }
+        else
+        {
+            if ( l4e[i4].avail1 != ot )
+                continue;
+            l4e[i4].avail1 = nt;
+            ept_p2m_type_to_flags(l4e+i4, nt);
+        }
+    }
+    unmap_domain_page(l4e);
+
+    ept_sync_all();
+}
+
+static void __ept_change_entry_type_global(struct domain *d,
+                                           u32 l1e_flags)
+{
+    p2m_type_t nt,ot;
+
+    if ( l1e_flags == (__PAGE_HYPERVISOR|_PAGE_USER) )
+    {
+        nt = p2m_ram_rw;
+        ot = p2m_ram_logdirty;
+    }
+    else if ( l1e_flags == (_PAGE_PRESENT|_PAGE_USER) )
+    {
+        nt = p2m_ram_logdirty;
+        ot = p2m_ram_rw;
+    }
+    else
+    {
+        nt = ot = p2m_ram_rw;
+        BUG();
+    }
+
+    ept_change_entry_type_global(d, ot, nt);
+}
+
 void ept_p2m_init(struct domain *d)
 {
     d->arch.p2m.set_entry = compat_ept_set_entry;
     d->arch.p2m.get_entry = compat_ept_get_entry;
     d->arch.p2m.get_entry_fast = compat_ept_get_entry_fast;
+    d->arch.p2m.change_entry_type_global = __ept_change_entry_type_global;
 }
 
 /*
diff --git a/arch/x86/mm/p2m.c b/arch/x86/mm/p2m.c
index 1205840..fb28701 100644
--- a/arch/x86/mm/p2m.c
+++ b/arch/x86/mm/p2m.c
@@ -48,6 +48,9 @@
         (_d)->arch.p2m.locker_function = "nobody";      \
     } while (0)
 
+#define p2m_locked_by_me(_d) \
+    (current->processor == (_d)->arch.p2m.locker)
+
 #define p2m_lock(_d)                                    \
     do {                                                \
         if ( unlikely((_d)->arch.p2m.locker == current->processor) )\
@@ -315,12 +318,20 @@ void p2m_init(struct domain *d)
     d->arch.p2m.set_entry = p2m_set_entry;
     d->arch.p2m.get_entry = p2m_gfn_to_mfn;
     d->arch.p2m.get_entry_fast = p2m_gfn_to_mfn_fast;
+    d->arch.p2m.change_entry_type_global = p2m_set_flags_global;
 
     if ( is_hvm_domain(d) && hap_enabled(d) &&
          (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
         ept_p2m_init(d);
 }
 
+void p2m_change_entry_type_global(struct domain *d, u32 l1e_flags)
+{
+    p2m_lock(d);
+    d->arch.p2m.change_entry_type_global(d, l1e_flags);
+    p2m_unlock(d);
+}
+
 static inline
 int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, u32 l1e_flags)
 {
@@ -882,8 +893,8 @@ void p2m_set_flags_global(struct domain *d, u32 l1e_flags)
     if ( pagetable_get_pfn(d->arch.phys_table) == 0 )
         return;
 
-    p2m_lock(d);
-
+    ASSERT(p2m_locked_by_me(d));
+
 #if CONFIG_PAGING_LEVELS == 4
     l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
 #elif CONFIG_PAGING_LEVELS == 3
@@ -952,7 +963,6 @@ void p2m_set_flags_global(struct domain *d, u32 l1e_flags)
     unmap_domain_page(l2e);
 #endif
 
-    p2m_unlock(d);
 }
 
 /* This function traces through P2M table and modifies l1e flags of a specific
diff --git a/include/asm-x86/domain.h b/include/asm-x86/domain.h
index 53028d2..50f6245 100644
--- a/include/asm-x86/domain.h
+++ b/include/asm-x86/domain.h
@@ -144,6 +144,9 @@ struct p2m_domain {
     mfn_t              (*get_entry   )(struct domain *d, unsigned long gfn);
     mfn_t              (*get_entry_fast)(unsigned long gfn);
 
+    void               (*change_entry_type_global)(struct domain *d,
+                                                   u32 l1e_flags);
+
     /* Highest guest frame that's ever been mapped in the p2m */
     unsigned long max_mapped_pfn;
 };
diff --git a/include/asm-x86/hvm/vmx/vmx.h b/include/asm-x86/hvm/vmx/vmx.h
index dc61578..b80d0fa 100644
--- a/include/asm-x86/hvm/vmx/vmx.h
+++ b/include/asm-x86/hvm/vmx/vmx.h
@@ -47,6 +47,8 @@ typedef union {
 
 #define EPT_TABLE_ORDER     9
 
+extern mfn_t ept_get_entry(struct domain *d, unsigned long gfn, p2m_type_t *t);
+
 void vmx_asm_vmexit_handler(struct cpu_user_regs);
 void vmx_asm_do_vmentry(void);
 void vmx_intr_assist(void);
@@ -363,4 +365,47 @@ static inline void vmx_inject_extint(struct vcpu *v, int trap, int error_code)
 
 void ept_p2m_init(struct domain *d);
 
+/* EPT violation qualifications definitions */
+/* bit offset 0 in exit qualification */
+#define _EPT_READ_VIOLATION         0
+#define EPT_READ_VIOLATION          (1UL<<_EPT_READ_VIOLATION)
+/* bit offset 1 in exit qualification */
+#define _EPT_WRITE_VIOLATION        1
+#define EPT_WRITE_VIOLATION         (1UL<<_EPT_WRITE_VIOLATION)
+/* bit offset 2 in exit qualification */
+#define _EPT_EXEC_VIOLATION         2
+#define EPT_EXEC_VIOLATION          (1UL<<_EPT_EXEC_VIOLATION)
+
+/* bit offset 3 in exit qualification */
+#define _EPT_EFFECTIVE_READ         3
+#define EPT_EFFECTIVE_READ          (1UL<<_EPT_EFFECTIVE_READ)
+/* bit offset 4 in exit qualification */
+#define _EPT_EFFECTIVE_WRITE        4
+#define EPT_EFFECTIVE_WRITE         (1UL<<_EPT_EFFECTIVE_WRITE)
+/* bit offset 5 in exit qualification */
+#define _EPT_EFFECTIVE_EXEC         5
+#define EPT_EFFECTIVE_EXEC          (1UL<<_EPT_EFFECTIVE_EXEC)
+
+/* bit offset 6 in exit qualification */
+#define _EPT_GAW_VIOLATION          6
+#define EPT_GAW_VIOLATION           (1UL<<_EPT_GAW_VIOLATION)
+
+/* bits offset 7 & 8 in exit qualification */
+#define _EPT_GLA_VALIDITY           7
+#define EPT_GLA_VALIDITY_MASK       (3UL<<_EPT_GLA_VALIDITY)
+/* gla != gpa, when load PDPTR */
+#define EPT_GLA_VALIDITY_PDPTR_LOAD (0UL<<_EPT_GLA_VALIDITY)
+/* gla != gpa, during guest page table walking */
+#define EPT_GLA_VALIDITY_GPT_WALK   (1UL<<_EPT_GLA_VALIDITY)
+/* reserved */
+#define EPT_GLA_VALIDITY_RSVD       (2UL<<_EPT_GLA_VALIDITY)
+/* gla == gpa, normal case */
+#define EPT_GLA_VALIDITY_MATCH      (3UL<<_EPT_GLA_VALIDITY)
+
+#define EPT_EFFECTIVE_MASK          (EPT_EFFECTIVE_READ  |   \
+                                     EPT_EFFECTIVE_WRITE |   \
+                                     EPT_EFFECTIVE_EXEC)
+
+#define EPT_PAGETABLE_ENTRIES       512
+
 #endif /* __ASM_X86_HVM_VMX_VMX_H__ */
diff --git a/include/asm-x86/p2m.h b/include/asm-x86/p2m.h
index 40501e0..71035d0 100644
--- a/include/asm-x86/p2m.h
+++ b/include/asm-x86/p2m.h
@@ -200,6 +200,8 @@ void guest_physmap_remove_page(struct domain *d, unsigned long gfn,
 /* set P2M table l1e flags */
 void p2m_set_flags_global(struct domain *d, u32 l1e_flags);
 
+void p2m_change_entry_type_global(struct domain *d, u32 l1e_flags);
+
 /* set P2M table l1e flags for a gpa */
 int p2m_set_flags(struct domain *d, paddr_t gpa, u32 l1e_flags);
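
Editor's note: for reviewers who want to sanity-check the GLA-validity
handling in ept_handle_violation() above, here is a minimal standalone
sketch (not part of the patch, compiles as an ordinary userspace C
program). The macro values are copied from the vmx.h hunk; the sample
qualification values and the helper gla_validity_name() are invented
purely for illustration.

/* Decode bits 7:8 of an EPT violation exit qualification the same way
 * the patched handler does, and print what each value means. */
#include <stdio.h>

/* Values copied from the patch's include/asm-x86/hvm/vmx/vmx.h hunk. */
#define _EPT_GLA_VALIDITY           7
#define EPT_GLA_VALIDITY_MASK       (3UL<<_EPT_GLA_VALIDITY)
#define EPT_GLA_VALIDITY_PDPTR_LOAD (0UL<<_EPT_GLA_VALIDITY)
#define EPT_GLA_VALIDITY_GPT_WALK   (1UL<<_EPT_GLA_VALIDITY)
#define EPT_GLA_VALIDITY_RSVD       (2UL<<_EPT_GLA_VALIDITY)
#define EPT_GLA_VALIDITY_MATCH      (3UL<<_EPT_GLA_VALIDITY)

/* Hypothetical helper: maps the GLA-validity field to the action the
 * patched ept_handle_violation() takes for it. */
static const char *gla_validity_name(unsigned long qualification)
{
    switch ( qualification & EPT_GLA_VALIDITY_MASK )
    {
    case EPT_GLA_VALIDITY_PDPTR_LOAD:
        return "PDPTR load (handler crashes the domain)";
    case EPT_GLA_VALIDITY_GPT_WALK:
        return "guest page table walk (A/D-bit write; allowed for RAM "
               "in log-dirty mode, otherwise domain is crashed)";
    case EPT_GLA_VALIDITY_RSVD:
        return "reserved encoding (handler crashes the domain)";
    case EPT_GLA_VALIDITY_MATCH:
        return "match (normal case: log-dirty marking or MMIO path)";
    }
    return "unreachable";
}

int main(void)
{
    /* Invented sample qualifications; only bits 7:8 matter here. */
    unsigned long samples[] = { 3UL << 7, 1UL << 7, 2UL << 7, 0UL << 7 };
    for ( unsigned i = 0; i < sizeof(samples)/sizeof(samples[0]); i++ )
        printf("qualification %#05lx -> %s\n", samples[i],
               gla_validity_name(samples[i]));
    return 0;
}

Running it simply prints the four encodings and the corresponding action,
mirroring the decision tree the handler implements: crash on RSVD or
PDPTR_LOAD, tolerate GPT_WALK only for RAM pages in log-dirty mode, and
treat MATCH as the ordinary dirty-logging/MMIO path.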