From: Chris Lalancette <clalance@redhat.com>
Date: Fri, 10 Apr 2009 09:47:14 +0200
Subject: [xen] x86: GDT: replace single page with one page/CPU
Message-id: 49DEF982.6060409@redhat.com
O-Subject: [RHEL5.4 PATCH 1/2]: Replace single GDT page with one GDT page per CPU
Bugzilla: 477206
RH-Acked-by: Gerd Hoffmann <kraxel@redhat.com>
RH-Acked-by: Prarit Bhargava <prarit@redhat.com>

As mentioned in 0/2, this patch replaces the single shared GDT page with one
GDT page per CPU, allowing a (theoretically) unlimited number of processors.
The next patch bumps the supported processor count to 256. This should
resolve BZ 477206.

diff --git a/arch/x86/boot/x86_32.S b/arch/x86/boot/x86_32.S
index b876a7f..39255b4 100644
--- a/arch/x86/boot/x86_32.S
+++ b/arch/x86/boot/x86_32.S
@@ -78,7 +78,7 @@ idt_descr:
         .word   0
 gdt_descr:
         .word   LAST_RESERVED_GDT_BYTE
-        .long   gdt_table - FIRST_RESERVED_GDT_BYTE
+        .long   boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE

         .align  PAGE_SIZE, 0
 /* NB. Rings != 0 get access up to MACH2PHYS_VIRT_END. This allows access to */
@@ -86,16 +86,17 @@ gdt_descr:
 #define GUEST_DESC(d)                                                   \
         .long ((MACH2PHYS_VIRT_END - 1) >> 12) & 0xffff,                \
               ((MACH2PHYS_VIRT_END - 1) >> 12) & (0xf << 16) | (d)
-ENTRY(gdt_table)
-        .quad 0x0000000000000000     /* unused */
+ENTRY(boot_cpu_gdt_table)
+        .quad 0x0000000000000000     /* double fault TSS */
         .quad 0x00cf9a000000ffff     /* 0xe008 ring 0 4.00GB code at 0x0 */
         .quad 0x00cf92000000ffff     /* 0xe010 ring 0 4.00GB data at 0x0 */
         GUEST_DESC(0x00c0ba00)       /* 0xe019 ring 1 3.xxGB code at 0x0 */
         GUEST_DESC(0x00c0b200)       /* 0xe021 ring 1 3.xxGB data at 0x0 */
         GUEST_DESC(0x00c0fa00)       /* 0xe02b ring 3 3.xxGB code at 0x0 */
         GUEST_DESC(0x00c0f200)       /* 0xe033 ring 3 3.xxGB data at 0x0 */
-        .quad 0x0000000000000000     /* unused */
-        .fill 2*NR_CPUS,8,0          /* space for TSS and LDT per CPU */
+        .fill (PER_CPU_GDT_ENTRY - FLAT_RING3_DS / 8 - 1), 8, 0
+        .quad 0x0000910000000000     /* per-CPU entry (limit == cpu) */
+        .align PAGE_SIZE,0

 #ifdef CONFIG_X86_PAE
         .align 32
diff --git a/arch/x86/boot/x86_64.S b/arch/x86/boot/x86_64.S
index 46c34c1..7932f8a 100644
--- a/arch/x86/boot/x86_64.S
+++ b/arch/x86/boot/x86_64.S
@@ -85,7 +85,7 @@ multiboot_ptr:
         .word   0
 gdt_descr:
         .word   LAST_RESERVED_GDT_BYTE
-        .quad   gdt_table - FIRST_RESERVED_GDT_BYTE
+        .quad   boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE

         .word   0,0,0
 idt_descr:
@@ -96,7 +96,7 @@ ENTRY(stack_start)
         .quad   cpu0_stack

         .align PAGE_SIZE, 0
-ENTRY(gdt_table)
+ENTRY(boot_cpu_gdt_table)
         .quad 0x0000000000000000     /* unused */
         .quad 0x00af9a000000ffff     /* 0xe008 ring 0 code, 64-bit mode   */
         .quad 0x00cf92000000ffff     /* 0xe010 ring 0 data                */
@@ -105,13 +105,13 @@ ENTRY(gdt_table)
         .quad 0x00cff2000000ffff     /* 0xe02b ring 3 data                */
         .quad 0x00affa000000ffff     /* 0xe033 ring 3 code, 64-bit mode   */
         .quad 0x00cf9a000000ffff     /* 0xe038 ring 0 code, compatibility */
-        .org gdt_table - FIRST_RESERVED_GDT_BYTE + __TSS(0) * 8
-        .fill 4*NR_CPUS,8,0          /* space for TSS and LDT per CPU     */
+        .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0
+        .quad 0x0000910000000000     /* per-CPU entry (limit == cpu) */

         .align PAGE_SIZE, 0
 /* NB. Even rings != 0 get access to the full 4Gb, as only the          */
 /* (compatibility) machine->physical mapping table lives there.         */
-ENTRY(compat_gdt_table)
+ENTRY(boot_cpu_compat_gdt_table)
         .quad 0x0000000000000000     /* unused */
         .quad 0x00af9a000000ffff     /* 0xe008 ring 0 code, 64-bit mode   */
         .quad 0x00cf92000000ffff     /* 0xe010 ring 0 data                */
@@ -120,5 +120,6 @@ ENTRY(compat_gdt_table)
         .quad 0x00cffa000000ffff     /* 0xe02b ring 3 code, compatibility */
         .quad 0x00cff2000000ffff     /* 0xe033 ring 3 data                */
         .quad 0x00cf9a000000ffff     /* 0xe038 ring 0 code, compatibility */
-        .org compat_gdt_table - FIRST_RESERVED_GDT_BYTE + __TSS(0) * 8
-        .fill 4*NR_CPUS,8,0          /* space for TSS and LDT per CPU     */
+        .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0
+        .quad 0x0000910000000000     /* per-CPU entry (limit == cpu) */
+        .align PAGE_SIZE, 0
diff --git a/arch/x86/cpu/common.c b/arch/x86/cpu/common.c
index 268bcc2..5e02243 100644
--- a/arch/x86/cpu/common.c
+++ b/arch/x86/cpu/common.c
@@ -555,7 +555,10 @@ void __devinit cpu_init(void)
 {
     int cpu = smp_processor_id();
     struct tss_struct *t = &init_tss[cpu];
-    char gdt_load[10];
+    struct desc_ptr gdt_desc = {
+        .base = (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY),
+        .limit = LAST_RESERVED_GDT_BYTE
+    };

     if (cpu_test_and_set(cpu, cpu_initialized)) {
         printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
@@ -569,9 +572,10 @@ void __devinit cpu_init(void)
     if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
         clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);

-    *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
-    *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(current);
-    __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
+    /* Install correct page table. */
+    write_ptbase(current);
+
+    __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_desc) );

     /* No nested task. */
     __asm__("pushf ; andw $0xbfff,(%"__OP"sp) ; popf");
@@ -599,7 +603,4 @@ void __devinit cpu_init(void)
 #define CD(register) __asm__("mov %0,%%db" #register ::"r"(0UL) );
     CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7);
 #undef CD
-
-    /* Install correct page table. */
-    write_ptbase(current);
 }
diff --git a/arch/x86/domain.c b/arch/x86/domain.c
index bee2dfb..84aaee4 100644
--- a/arch/x86/domain.c
+++ b/arch/x86/domain.c
@@ -274,7 +274,6 @@ static inline int may_switch_mode(struct domain *d)

 int switch_native(struct domain *d)
 {
-    l1_pgentry_t gdt_l1e;
     unsigned int vcpuid;

     if ( d == NULL )
@@ -287,12 +286,8 @@ int switch_native(struct domain *d)
     d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
     release_arg_xlat_area(d);

-    /* switch gdt */
-    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
     for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
     {
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
         if (d->vcpu[vcpuid])
             release_compat_l4(d->vcpu[vcpuid]);
     }
@@ -302,7 +297,6 @@ int switch_native(struct domain *d)

 int switch_compat(struct domain *d)
 {
-    l1_pgentry_t gdt_l1e;
     unsigned int vcpuid;

     if ( d == NULL )
@@ -314,15 +308,11 @@ int switch_compat(struct domain *d)

     d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;

-    /* switch gdt */
-    gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), PAGE_HYPERVISOR);
     for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
     {
         if ( (d->vcpu[vcpuid] != NULL) &&
              (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
             goto undo_and_fail;
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
     }

     d->arch.physaddr_bitsize =
@@ -333,14 +323,10 @@ int switch_compat(struct domain *d)

  undo_and_fail:
     d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
-
     release_arg_xlat_area(d);
-    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
     while ( vcpuid-- != 0 )
     {
         if ( d->vcpu[vcpuid] != NULL )
             release_compat_l4(d->vcpu[vcpuid]);
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
     }
     return -ENOMEM;
 }
@@ -413,39 +399,28 @@ int arch_domain_create(struct domain *d)
     struct page_info *pg;
     int i;
 #endif
-    l1_pgentry_t gdt_l1e;
-    int vcpuid, pdpt_order;
+    int pdpt_order;
     int rc = -ENOMEM;

     INIT_LIST_HEAD(&d->arch.pdev_list);
     d->arch.relmem = RELMEM_not_started;
     INIT_LIST_HEAD(&d->arch.relmem_list);

+    pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
     d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
     if ( d->arch.mm_perdomain_pt == NULL )
         goto fail;
     memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);

-    /*
-     * Map Xen segments into every VCPU's GDT, irrespective of whether every
-     * VCPU will actually be used. This avoids an NMI race during context
-     * switch: if we take an interrupt after switching CR3 but before switching
-     * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
-     * try to load CS from an invalid table.
-     */
-    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
-    for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
-
 #if defined(__i386__)

     mapcache_init(d);

 #else /* __x86_64__ */

-    if ( (pg = alloc_domheap_page(NULL)) == NULL )
+    pg = alloc_domheap_page(NULL);
+    if (pg == NULL)
         goto fail;
     d->arch.mm_perdomain_l2 = page_to_virt(pg);
     clear_page(d->arch.mm_perdomain_l2);
@@ -454,7 +429,8 @@ int arch_domain_create(struct domain *d)
             l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
                           __PAGE_HYPERVISOR);

-    if ( (pg = alloc_domheap_page(NULL)) == NULL )
+    pg = alloc_domheap_page(NULL);
+    if ( pg == NULL )
         goto fail;
     d->arch.mm_perdomain_l3 = page_to_virt(pg);
     clear_page(d->arch.mm_perdomain_l3);
@@ -474,6 +450,7 @@ int arch_domain_create(struct domain *d)
     {
         d->arch.ioport_caps =
             rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
+        rc = -ENOMEM;
         if ( d->arch.ioport_caps == NULL )
             goto fail;

@@ -502,7 +479,6 @@ int arch_domain_create(struct domain *d)
         d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
             (CONFIG_PAGING_LEVELS != 4);
     }
-
     return 0;

@@ -1257,12 +1233,19 @@ static void paravirt_ctxt_switch_to(struct vcpu *v)
     }
 }

+static inline int need_full_gdt(struct vcpu *v)
+{
+    return (!is_hvm_vcpu(v) && !is_idle_vcpu(v));
+}
+
 static void __context_switch(void)
 {
     struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
     unsigned int          cpu = smp_processor_id();
     struct vcpu          *p = per_cpu(curr_vcpu, cpu);
     struct vcpu          *n = current;
+    struct desc_struct   *gdt;
+    struct desc_ptr       gdt_desc;

     ASSERT(p != n);
     ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
@@ -1288,14 +1271,35 @@ static void __context_switch(void)
     cpu_set(cpu, n->domain->domain_dirty_cpumask);
     cpu_set(cpu, n->vcpu_dirty_cpumask);

+    gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) :
+                                  per_cpu(compat_gdt_table, cpu);
+    if ( need_full_gdt(n) )
+    {
+        struct page_info *page = virt_to_page(gdt);
+        unsigned int i;
+        for ( i = 0; i < NR_RESERVED_GDT_PAGES; i++ )
+            l1e_write(n->domain->arch.mm_perdomain_pt +
+                      (n->vcpu_id << GDT_LDT_VCPU_SHIFT) +
+                      FIRST_RESERVED_GDT_PAGE + i,
+                      l1e_from_page(page + i, __PAGE_HYPERVISOR));
+    }
+
+    if ( need_full_gdt(p) &&
+         ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(n)) )
+    {
+        gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
+        gdt_desc.base = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
+        asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
+    }
+
     write_ptbase(n);

-    if ( p->vcpu_id != n->vcpu_id )
+    if ( need_full_gdt(n) &&
+         ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(p)) )
     {
-        char gdt_load[10];
-        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
-        *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
-        __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
+        gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
+        gdt_desc.base = GDT_VIRT_START(n);
+        asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
     }

     if ( p->domain != n->domain )
@@ -1345,9 +1349,6 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
     {
         uint64_t efer = read_efer();

-        local_flush_tlb_one(GDT_VIRT_START(next) +
-                            FIRST_RESERVED_GDT_BYTE);
-
         if ( !is_pv_32on64_vcpu(next) == !(efer & EFER_SCE) )
             write_efer(efer ^ EFER_SCE);
     }
diff --git a/arch/x86/domain_build.c b/arch/x86/domain_build.c
index 40e6e93..f8e464d 100644
--- a/arch/x86/domain_build.c
+++ b/arch/x86/domain_build.c
@@ -335,24 +335,11 @@ int __init construct_dom0(
 #ifdef CONFIG_COMPAT
     if ( compat32 )
     {
-        l1_pgentry_t gdt_l1e;
-
         d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
         v->vcpu_info = (void *)&d->shared_info->compat.vcpu_info[0];

         if ( nr_pages != (unsigned int)nr_pages )
             nr_pages = UINT_MAX;
-
-        /*
-         * Map compatibility Xen segments into every VCPU's GDT. See
-         * arch_domain_create() for further comments.
-         */
-        gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table),
-                                PAGE_HYPERVISOR);
-        for ( i = 0; i < MAX_VIRT_CPUS; i++ )
-            d->arch.mm_perdomain_pt[((i << GDT_LDT_VCPU_SHIFT) +
-                                     FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
-        local_flush_tlb_one(GDT_LDT_VIRT_START + FIRST_RESERVED_GDT_BYTE);
     }
 #endif
     if ( parms.pae == PAEKERN_extended_cr3 )
diff --git a/arch/x86/hvm/vmx/vmcs.c b/arch/x86/hvm/vmx/vmcs.c
index a241d9f..8a119d4 100644
--- a/arch/x86/hvm/vmx/vmcs.c
+++ b/arch/x86/hvm/vmx/vmcs.c
@@ -334,27 +334,14 @@ struct host_execution_env {

 static void vmx_set_host_env(struct vcpu *v)
 {
-    unsigned int tr, cpu;
-    struct host_execution_env host_env;
-    struct Xgt_desc_struct desc;
-
-    cpu = smp_processor_id();
-    __asm__ __volatile__ ("sidt (%0) \n" :: "a"(&desc) : "memory");
-    host_env.idtr_limit = desc.size;
-    host_env.idtr_base = desc.address;
-    __vmwrite(HOST_IDTR_BASE, host_env.idtr_base);
-
-    __asm__ __volatile__ ("sgdt (%0) \n" :: "a"(&desc) : "memory");
-    host_env.gdtr_limit = desc.size;
-    host_env.gdtr_base = desc.address;
-    __vmwrite(HOST_GDTR_BASE, host_env.gdtr_base);
-
-    __asm__ __volatile__ ("str (%0) \n" :: "a"(&tr) : "memory");
-    host_env.tr_selector = tr;
-    host_env.tr_limit = sizeof(struct tss_struct);
-    host_env.tr_base = (unsigned long) &init_tss[cpu];
-    __vmwrite(HOST_TR_SELECTOR, host_env.tr_selector);
-    __vmwrite(HOST_TR_BASE, host_env.tr_base);
+    unsigned int cpu = smp_processor_id();
+
+    __vmwrite(HOST_GDTR_BASE,
+              (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY));
+    __vmwrite(HOST_IDTR_BASE, (unsigned long)idt_tables[cpu]);
+
+    __vmwrite(HOST_TR_SELECTOR, TSS_ENTRY << 3);
+    __vmwrite(HOST_TR_BASE, (unsigned long)&init_tss[cpu]);

     /*
      * Skip end of cpu_user_regs when entering the hypervisor because the
diff --git a/arch/x86/setup.c b/arch/x86/setup.c
index 4ff4f6b..2f0204f 100644
--- a/arch/x86/setup.c
+++ b/arch/x86/setup.c
@@ -109,6 +109,12 @@ extern void early_cpu_init(void);
 extern void vesa_init(void);
 extern void vesa_mtrr_init(void);

+DEFINE_PER_CPU(struct desc_struct *, gdt_table) = boot_cpu_gdt_table;
+#ifdef CONFIG_COMPAT
+DEFINE_PER_CPU(struct desc_struct *, compat_gdt_table)
+    = boot_cpu_compat_gdt_table;
+#endif
+
 struct tss_struct init_tss[NR_CPUS];

 char __attribute__ ((__section__(".bss.stack_aligned"))) cpu0_stack[STACK_SIZE];
diff --git a/arch/x86/smpboot.c b/arch/x86/smpboot.c
index 027fb05..c23c90b 100644
--- a/arch/x86/smpboot.c
+++ b/arch/x86/smpboot.c
@@ -805,10 +805,15 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
  */
 {
     unsigned long boot_error;
+    unsigned int order;
     int timeout;
     unsigned long start_eip;
     unsigned short nmi_high = 0, nmi_low = 0;
     struct vcpu *v;
+    struct desc_struct *gdt;
+#ifdef __x86_64__
+    struct page_info *page;
+#endif

     ++cpucount;

@@ -828,6 +833,41 @@ static int __devinit do_boot_cpu(int apicid, int cpu)

     /* Debug build: detect stack overflow by setting up a guard page. */
     memguard_guard_stack(stack_start.esp);

+    gdt = per_cpu(gdt_table, cpu);
+    if (gdt == boot_cpu_gdt_table) {
+        order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
+#ifdef __x86_64__
+#ifdef CONFIG_COMPAT
+        page = alloc_domheap_pages(NULL, order, 0);
+        per_cpu(compat_gdt_table, cpu) = gdt = page_to_virt(page);
+        memcpy(gdt, boot_cpu_compat_gdt_table,
+               NR_RESERVED_GDT_PAGES * PAGE_SIZE);
+        gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
+#endif
+        page = alloc_domheap_pages(NULL, order, 0);
+        per_cpu(gdt_table, cpu) = gdt = page_to_virt(page);
+#else
+        per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order);
+#endif
+        memcpy(gdt, boot_cpu_gdt_table,
+               NR_RESERVED_GDT_PAGES * PAGE_SIZE);
+        BUILD_BUG_ON(NR_CPUS > 0x10000);
+        gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
+    }
+
+#ifdef __i386__
+    if (!per_cpu(doublefault_tss, cpu)) {
+        per_cpu(doublefault_tss, cpu) = alloc_xenheap_page();
+        memset(per_cpu(doublefault_tss, cpu), 0, PAGE_SIZE);
+    }
+#endif
+
+    if (!idt_tables[cpu]) {
+        idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES);
+        memcpy(idt_tables[cpu], idt_table,
+               IDT_ENTRIES*sizeof(idt_entry_t));
+    }
+
     /*
      * This grunge runs the startup process for
      * the targeted processor.
diff --git a/arch/x86/traps.c b/arch/x86/traps.c
index 8e72af7..5de70ee 100644
--- a/arch/x86/traps.c
+++ b/arch/x86/traps.c
@@ -2192,13 +2192,13 @@ void set_task_gate(unsigned int n, unsigned int sel)
 void set_tss_desc(unsigned int n, void *addr)
 {
     _set_tssldt_desc(
-        gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
+        per_cpu(gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
         (unsigned long)addr,
         offsetof(struct tss_struct, __cacheline_filler) - 1,
         9);
 #ifdef CONFIG_COMPAT
     _set_tssldt_desc(
-        compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
+        per_cpu(compat_gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
         (unsigned long)addr,
         offsetof(struct tss_struct, __cacheline_filler) - 1,
         11);
diff --git a/arch/x86/x86_32/mm.c b/arch/x86/x86_32/mm.c
index 9b3780f..ae4a340 100644
--- a/arch/x86/x86_32/mm.c
+++ b/arch/x86/x86_32/mm.c
@@ -191,7 +191,7 @@ void __init subarch_init_memory(void)
     {
         /* Guest kernel runs in ring 0, not ring 1. */
         struct desc_struct *d;
-        d = &gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
+        d = &boot_cpu_gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
         d[0].b &= ~_SEGMENT_DPL;
         d[1].b &= ~_SEGMENT_DPL;
     }
diff --git a/arch/x86/x86_32/supervisor_mode_kernel.S b/arch/x86/x86_32/supervisor_mode_kernel.S
index c795704..39a74ac 100644
--- a/arch/x86/x86_32/supervisor_mode_kernel.S
+++ b/arch/x86/x86_32/supervisor_mode_kernel.S
@@ -100,15 +100,10 @@ ENTRY(fixup_ring0_guest_stack)
         # %gs:%esi now points to the guest stack before the
         # interrupt/exception occured.

-        /*
-         * Reverse the __TSS macro, giving us the CPU number.
-         * The TSS for this cpu is at init_tss + ( cpu * 128 ).
-         */
-        str   %ecx
-        shrl  $3,%ecx                             # Calculate GDT index for TSS.
-        subl  $(FIRST_RESERVED_GDT_ENTRY+8),%ecx  # %ecx = 2*cpu.
-        shll  $6,%ecx                             # Each TSS entry is 0x80 bytes
-        addl  $init_tss,%ecx                      # but we have 2*cpu from above.
+        movl  $PER_CPU_GDT_ENTRY*8,%ecx
+        lsll  %ecx,%ecx
+        shll  $7,%ecx                             # Each TSS entry is 0x80 bytes
+        addl  $init_tss,%ecx

         # Load Xen stack from TSS.
         movw  TSS_ss0(%ecx),%ax
diff --git a/arch/x86/x86_32/traps.c b/arch/x86/x86_32/traps.c
index ebd174d..666a94a 100644
--- a/arch/x86/x86_32/traps.c
+++ b/arch/x86/x86_32/traps.c
@@ -136,19 +136,20 @@ void show_page_walk(unsigned long addr)
     unmap_domain_page(l1t);
 }

-#define DOUBLEFAULT_STACK_SIZE 2048
-static struct tss_struct doublefault_tss;
-static unsigned char doublefault_stack[DOUBLEFAULT_STACK_SIZE];
+DEFINE_PER_CPU(struct tss_struct *, doublefault_tss);
+static unsigned char __attribute__ ((__section__ (".bss.page_aligned")))
+    boot_cpu_doublefault_space[PAGE_SIZE];

 asmlinkage void do_double_fault(void)
 {
-    struct tss_struct *tss = &doublefault_tss;
-    unsigned int cpu = ((tss->back_link>>3)-__FIRST_TSS_ENTRY)>>1;
+    struct tss_struct *tss;
+    unsigned int cpu;

     watchdog_disable();

     console_force_unlock();

+    asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
+
     /* Find information saved during fault and dump it to the console. */
     tss = &init_tss[cpu];
     printk("*** DOUBLE FAULT ***\n");
@@ -234,34 +235,36 @@ unsigned long do_iret(void)

 void __init percpu_traps_init(void)
 {
-    struct tss_struct *tss = &doublefault_tss;
+    struct tss_struct *tss = this_cpu(doublefault_tss);
     asmlinkage int hypercall(void);

-    if ( smp_processor_id() != 0 )
-        return;
+    if ( !tss )
+    {
+        /* The hypercall entry vector is only accessible from ring 1. */
+        _set_gate(idt_table+HYPERCALL_VECTOR, 14, 1, &hypercall);

-    /* The hypercall entry vector is only accessible from ring 1. */
-    _set_gate(idt_table+HYPERCALL_VECTOR, 14, 1, &hypercall);
+        tss = (void *)boot_cpu_doublefault_space;
+        this_cpu(doublefault_tss) = tss;
+    }

     /*
      * Make a separate task for double faults. This will get us debug output if
      * we blow the kernel stack.
      */
-    memset(tss, 0, sizeof(*tss));
     tss->ds = __HYPERVISOR_DS;
     tss->es = __HYPERVISOR_DS;
     tss->ss = __HYPERVISOR_DS;
-    tss->esp = (unsigned long)&doublefault_stack[DOUBLEFAULT_STACK_SIZE];
+    tss->esp = (unsigned long)tss + PAGE_SIZE;
     tss->__cr3 = __pa(idle_pg_table);
     tss->cs = __HYPERVISOR_CS;
     tss->eip = (unsigned long)do_double_fault;
     tss->eflags = 2;
     tss->bitmap = IOBMP_INVALID_OFFSET;
     _set_tssldt_desc(
-        gdt_table + __DOUBLEFAULT_TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
+        this_cpu(gdt_table) + DOUBLEFAULT_TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
         (unsigned long)tss, 235, 9);

-    set_task_gate(TRAP_double_fault, __DOUBLEFAULT_TSS_ENTRY<<3);
+    set_task_gate(TRAP_double_fault, DOUBLEFAULT_TSS_ENTRY << 3);
 }

 void init_int80_direct_trap(struct vcpu *v)
diff --git a/arch/x86/x86_64/traps.c b/arch/x86/x86_64/traps.c
index 7a9c2c1..d5615b3 100644
--- a/arch/x86/x86_64/traps.c
+++ b/arch/x86/x86_64/traps.c
@@ -147,15 +147,14 @@ void show_page_walk(unsigned long addr)
 asmlinkage void double_fault(void);
 asmlinkage void do_double_fault(struct cpu_user_regs *regs)
 {
-    unsigned int cpu, tr;
-
-    asm ( "str %0" : "=r" (tr) );
-    cpu = ((tr >> 3) - __FIRST_TSS_ENTRY) >> 2;
+    unsigned int cpu;

     watchdog_disable();

     console_force_unlock();

+    asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
+
     /* Find information saved during fault and dump it to the console. */
     printk("*** DOUBLE FAULT ***\n");
     print_xen_info();
diff --git a/include/asm-x86/desc.h b/include/asm-x86/desc.h
index a79627f..1797427 100644
--- a/include/asm-x86/desc.h
+++ b/include/asm-x86/desc.h
@@ -5,11 +5,7 @@
 /*
  * Xen reserves a memory page of GDT entries.
  * No guest GDT entries exist beyond the Xen reserved area.
 */
-#if MAX_PHYS_CPUS > 64
-#define NR_RESERVED_GDT_PAGES 2
-#else
 #define NR_RESERVED_GDT_PAGES 1
-#endif
 #define NR_RESERVED_GDT_BYTES (NR_RESERVED_GDT_PAGES * PAGE_SIZE)
 #define NR_RESERVED_GDT_ENTRIES (NR_RESERVED_GDT_BYTES / 8)

@@ -38,11 +34,9 @@
 #define FLAT_COMPAT_USER_CS FLAT_COMPAT_RING3_CS
 #define FLAT_COMPAT_USER_SS FLAT_COMPAT_RING3_SS

-#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
-#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 2)
-
-#define __TSS(n) (((n)<<2) + __FIRST_TSS_ENTRY)
-#define __LDT(n) (((n)<<2) + __FIRST_LDT_ENTRY)
+#define TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
+#define LDT_ENTRY (TSS_ENTRY + 2)
+#define PER_CPU_GDT_ENTRY (LDT_ENTRY + 2)

 #elif defined(__i386__)

@@ -53,19 +47,17 @@
 #define FLAT_COMPAT_USER_DS FLAT_USER_DS
 #define FLAT_COMPAT_USER_SS FLAT_USER_SS

-#define __DOUBLEFAULT_TSS_ENTRY FIRST_RESERVED_GDT_ENTRY
-
-#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
-#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 1)
+#define DOUBLEFAULT_TSS_ENTRY FIRST_RESERVED_GDT_ENTRY

-#define __TSS(n) (((n)<<1) + __FIRST_TSS_ENTRY)
-#define __LDT(n) (((n)<<1) + __FIRST_LDT_ENTRY)
+#define TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
+#define LDT_ENTRY (TSS_ENTRY + 1)
+#define PER_CPU_GDT_ENTRY (LDT_ENTRY + 1)

 #endif

 #ifndef __ASSEMBLY__

-#define load_TR(n)  __asm__ __volatile__ ("ltr %%ax" : : "a" (__TSS(n)<<3) )
+#define load_TR(n)  __asm__ __volatile__ ("ltr %%ax" : : "a" (TSS_ENTRY<<3) )

 #if defined(__x86_64__)
 #define GUEST_KERNEL_RPL(d) (is_pv_32bit_domain(d) ? 1 : 3)
@@ -198,20 +190,25 @@ __asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
     "rorl $16,%%eax" \
     : "=m"(*(n)) : "a" (addr), "r"(n), "ir"(limit), "i"(type|0x80))

+DECLARE_PER_CPU(struct tss_struct *, doublefault_tss);
+
 #endif

-extern struct desc_struct gdt_table[];
+struct desc_ptr {
+    unsigned short limit;
+    unsigned long base;
+} __attribute__((__packed__)) ;
+
+extern struct desc_struct boot_cpu_gdt_table[];
+DECLARE_PER_CPU(struct desc_struct *, gdt_table);
 #ifdef CONFIG_COMPAT
-extern struct desc_struct compat_gdt_table[];
+extern struct desc_struct boot_cpu_compat_gdt_table[];
+DECLARE_PER_CPU(struct desc_struct *, compat_gdt_table);
 #else
-# define compat_gdt_table gdt_table
+# define boot_cpu_compat_gdt_table boot_cpu_gdt_table
+# define per_cpu__compat_gdt_table per_cpu__gdt_table
 #endif

-struct Xgt_desc_struct {
-    unsigned short size;
-    unsigned long address __attribute__((packed));
-};
-
 extern void set_intr_gate(unsigned int irq, void * addr);
 extern void set_system_gate(unsigned int n, void *addr);
 extern void set_task_gate(unsigned int n, unsigned int sel);
diff --git a/include/asm-x86/ldt.h b/include/asm-x86/ldt.h
index b9a769f..c7f3f06 100644
--- a/include/asm-x86/ldt.h
+++ b/include/asm-x86/ldt.h
@@ -6,7 +6,6 @@

 static inline void load_LDT(struct vcpu *v)
 {
-    unsigned int cpu;
     struct desc_struct *desc;
     unsigned long ents;

@@ -16,11 +15,11 @@ static inline void load_LDT(struct vcpu *v)
     }
     else
     {
-        cpu = smp_processor_id();
-        desc = (!is_pv_32on64_vcpu(v) ? gdt_table : compat_gdt_table)
-            + __LDT(cpu) - FIRST_RESERVED_GDT_ENTRY;
+        desc = (!is_pv_32on64_vcpu(v)
+                ? this_cpu(gdt_table) : this_cpu(compat_gdt_table))
+               + LDT_ENTRY - FIRST_RESERVED_GDT_ENTRY;
         _set_tssldt_desc(desc, LDT_VIRT_START(v), ents*8-1, 2);
-        __asm__ __volatile__ ( "lldt %%ax" : : "a" (__LDT(cpu)<<3) );
+        __asm__ __volatile__ ( "lldt %%ax" : : "a" (LDT_ENTRY << 3) );
     }
 }
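
A note for reviewers on the per-CPU entry: the descriptor installed at
PER_CPU_GDT_ENTRY stores the CPU number in its segment *limit* field, so
any code path (double fault handler, ring-0 stack fixup) can recover its
CPU number with a single lsll and no memory access. Below is a minimal
stand-alone sketch of that encoding, not Xen code; the helper names are
made up, and only the 0x0000910000000000 constant and the 16-bit limit
constraint come from the patch itself:

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  /*
   * Sketch only: build the 8-byte descriptor the patch installs at
   * PER_CPU_GDT_ENTRY.  0x0000910000000000 sets the present bit and a
   * system-segment type; or-ing the CPU number into bits 0-15 places it
   * in the descriptor's limit field, where LSL can read it back.  This
   * mirrors "gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu"
   * in do_boot_cpu(); BUILD_BUG_ON(NR_CPUS > 0x10000) guards exactly
   * this 16-bit field.
   */
  static uint64_t per_cpu_gdt_entry(unsigned int cpu)
  {
      assert(cpu < 0x10000);            /* must fit in limit[15:0] */
      return 0x0000910000000000ULL | cpu;
  }

  /* Decode as the hypervisor does via "lsll %1, %0" with selector
   * PER_CPU_GDT_ENTRY << 3: LSL returns the limit, i.e. the CPU id. */
  static unsigned int cpu_from_entry(uint64_t desc)
  {
      return (unsigned int)(desc & 0xffff);
  }

  int main(void)
  {
      for (unsigned int cpu = 0; cpu < 4; cpu++) {
          uint64_t d = per_cpu_gdt_entry(cpu);
          printf("cpu %u -> %#018llx -> %u\n",
                 cpu, (unsigned long long)d, cpu_from_entry(d));
      }
      return 0;
  }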
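The two lgdt sites in __context_switch() follow one small rule: drop to
the always-mapped per-CPU GDT before write_ptbase() whenever the outgoing
vcpu's virtually mapped GDT could disappear with the old page tables, and
load GDT_VIRT_START(n) afterwards whenever the incoming vcpu needs the
full GDT and is not already inheriting a valid mapping. A stand-alone
sketch of just that decision logic (hypothetical names, not Xen code):

  #include <stdbool.h>
  #include <stdio.h>

  /*
   * Sketch (not Xen code): the reload rule behind the two "lgdt" sites
   * in __context_switch().  A PV guest vcpu needs the full, virtually
   * mapped GDT (need_full_gdt()); HVM and idle vcpus run on the plain
   * per-CPU GDT.
   */
  struct reloads {
      bool to_percpu_before_cr3;   /* lgdt per-CPU GDT before write_ptbase() */
      bool to_virtual_after_cr3;   /* lgdt GDT_VIRT_START(n) after it        */
  };

  static struct reloads gdt_reloads(bool p_full, bool n_full, bool same_vcpu_id)
  {
      struct reloads r;
      /* Outgoing vcpu was on the virtually mapped GDT, and either the
       * mapping changes with the new vcpu_id or the incoming vcpu does
       * not use one: switch to the always-mapped per-CPU GDT before the
       * page-table switch can pull the old mapping away. */
      r.to_percpu_before_cr3 = p_full && (!same_vcpu_id || !n_full);
      /* Incoming vcpu needs the virtually mapped GDT and is not simply
       * inheriting a valid one from the outgoing vcpu. */
      r.to_virtual_after_cr3 = n_full && (!same_vcpu_id || !p_full);
      return r;
  }

  int main(void)
  {
      for (int p = 0; p < 2; p++)
          for (int n = 0; n < 2; n++)
              for (int s = 0; s < 2; s++) {
                  struct reloads r = gdt_reloads(p, n, s);
                  printf("p_full=%d n_full=%d same_id=%d -> before=%d after=%d\n",
                         p, n, s, r.to_percpu_before_cr3, r.to_virtual_after_cr3);
              }
      return 0;
  }

Enumerating the eight cases shows why HVM/idle-to-HVM/idle switches touch
the GDTR not at all, and why a PV-to-PV switch between vcpus with the same
vcpu_id skips both reloads.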