kernel-2.6.18-274.7.1.el5.src.rpm

diff -Naurp xen/acm/acm_simple_type_enforcement_hooks.c xen-redhat/acm/acm_simple_type_enforcement_hooks.c
--- xen/acm/acm_simple_type_enforcement_hooks.c
+++ xen-redhat/acm/acm_simple_type_enforcement_hooks.c
@@ -203,10 +203,10 @@ ste_init_state(struct acm_sized_buffer *
                     __func__, d->domain_id, ste_ssidref);
         /* a) check for event channel conflicts */
         for (bucket = 0; bucket < NR_EVTCHN_BUCKETS; bucket++) {
-            spin_lock(&d->evtchn_lock);
+            spin_lock(&d->event_lock);
             ports = d->evtchn[bucket];
             if (ports == NULL) {
-                spin_unlock(&d->evtchn_lock);
+                spin_unlock(&d->event_lock);
                 break;
             }
 
@@ -231,7 +231,7 @@ ste_init_state(struct acm_sized_buffer *
                     printkd("%s: Policy violation in event channel domain "
                             "%x -> domain %x.\n",
                             __func__, d->domain_id, rdomid);
-                    spin_unlock(&d->evtchn_lock);
+                    spin_unlock(&d->event_lock);
 
                     acm_array_append_tuple(errors,
                                            ACM_EVTCHN_SHARING_VIOLATION,
@@ -239,7 +239,7 @@ ste_init_state(struct acm_sized_buffer *
                     goto out;
                 }
             }
-            spin_unlock(&d->evtchn_lock);
+            spin_unlock(&d->event_lock);
         } 
 
 
diff -Naurp xen/arch/ia64/asm-offsets.c xen-redhat/arch/ia64/asm-offsets.c
--- xen/arch/ia64/asm-offsets.c
+++ xen-redhat/arch/ia64/asm-offsets.c
@@ -76,6 +76,7 @@ void foo(void)
 	BLANK();
 
 	DEFINE(IA64_DOMAIN_SHADOW_BITMAP_OFFSET, offsetof (struct domain, arch.shadow_bitmap));
+	DEFINE(IA64_DOMAIN_RID_BITS_OFFSET, offsetof (struct domain, arch.rid_bits));
 
 	BLANK();
 
diff -Naurp xen/arch/ia64/linux-xen/entry.S xen-redhat/arch/ia64/linux-xen/entry.S
--- xen/arch/ia64/linux-xen/entry.S
+++ xen-redhat/arch/ia64/linux-xen/entry.S
@@ -905,7 +905,7 @@ GLOBAL_ENTRY(ia64_leave_kernel)
 	;;
 (pUStk)	ssm psr.i
 (pUStk)	br.call.sptk.many b0=do_softirq
-(pUStk)	ssm psr.i
+(pUStk)	rsm psr.i
 	;;
 (pUStk)	br.call.sptk.many b0=reflect_event
 	;;
diff -Naurp xen/arch/ia64/linux-xen/pal.S xen-redhat/arch/ia64/linux-xen/pal.S
--- xen/arch/ia64/linux-xen/pal.S
+++ xen-redhat/arch/ia64/linux-xen/pal.S
@@ -21,11 +21,12 @@ pal_entry_point:
 	.text
 
 /*
- * Set the PAL entry point address.  This could be written in C code, but we do it here
- * to keep it all in one module (besides, it's so trivial that it's
+ * Set the PAL entry point address.  This could be written in C code, but we
+ * do it here to keep it all in one module (besides, it's so trivial that it's
  * not a big deal).
  *
- * in0		Address of the PAL entry point (text address, NOT a function descriptor).
+ * in0		Address of the PAL entry point (text address, NOT a function
+ *		descriptor).
  */
 GLOBAL_ENTRY(ia64_pal_handler_init)
 	alloc r3=ar.pfs,1,0,0,0
@@ -36,9 +37,9 @@ GLOBAL_ENTRY(ia64_pal_handler_init)
 END(ia64_pal_handler_init)
 
 /*
- * Default PAL call handler.  This needs to be coded in assembly because it uses
- * the static calling convention, i.e., the RSE may not be used and calls are
- * done via "br.cond" (not "br.call").
+ * Default PAL call handler.  This needs to be coded in assembly because it
+ * uses the static calling convention, i.e., the RSE may not be used and
+ * calls are done via "br.cond" (not "br.call").
  */
 GLOBAL_ENTRY(ia64_pal_default_handler)
 	mov r8=-1
@@ -50,12 +51,10 @@ END(ia64_pal_default_handler)
  *
  * in0         Index of PAL service
  * in1 - in3   Remaining PAL arguments
- * in4	       1 ==> clear psr.ic,  0 ==> don't clear psr.ic
- *
  */
 GLOBAL_ENTRY(ia64_pal_call_static)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
-	alloc loc1 = ar.pfs,5,5,0,0
+	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4)
+	alloc loc1 = ar.pfs,4,5,0,0
 	movl loc2 = pal_entry_point
 1:	{
 	  mov r28 = in0
@@ -64,7 +63,6 @@ GLOBAL_ENTRY(ia64_pal_call_static)
 	}
 	;;
 	ld8 loc2 = [loc2]		// loc2 <- entry point
-	tbit.nz p6,p7 = in4, 0
 	adds r8 = 1f-1b,r8
 	mov loc4=ar.rsc			// save RSE configuration
 	;;
@@ -74,13 +72,11 @@ GLOBAL_ENTRY(ia64_pal_call_static)
 	.body
 	mov r30 = in2
 
-(p6)	rsm psr.i | psr.ic
 	mov r31 = in3
 	mov b7 = loc2
 
-(p7)	rsm psr.i
+	rsm psr.i
 	;;
-(p6)	srlz.i
 	mov rp = r8
 	br.cond.sptk.many b7
 1:	mov psr.l = loc3
@@ -96,8 +92,8 @@ END(ia64_pal_call_static)
  * Make a PAL call using the stacked registers calling convention.
  *
  * Inputs:
- * 	in0         Index of PAL service
- * 	in2 - in3   Remaning PAL arguments
+ *	in0         Index of PAL service
+ *	in2 - in3   Remaining PAL arguments
  */
 GLOBAL_ENTRY(ia64_pal_call_stacked)
 	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4)
@@ -131,18 +127,18 @@ END(ia64_pal_call_stacked)
  * Make a physical mode PAL call using the static registers calling convention.
  *
  * Inputs:
- * 	in0         Index of PAL service
- * 	in2 - in3   Remaning PAL arguments
+ *	in0         Index of PAL service
+ *	in2 - in3   Remaining PAL arguments
  *
  * PSR_LP, PSR_TB, PSR_ID, PSR_DA are never set by the kernel.
  * So we don't need to clear them.
  */
-#define PAL_PSR_BITS_TO_CLEAR							\
-	(IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT  | IA64_PSR_DB | IA64_PSR_RT |	\
-	 IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED |		\
+#define PAL_PSR_BITS_TO_CLEAR						      \
+	(IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT  | IA64_PSR_DB | IA64_PSR_RT |\
+	 IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED |	      \
 	 IA64_PSR_DFL | IA64_PSR_DFH)
 
-#define PAL_PSR_BITS_TO_SET							\
+#define PAL_PSR_BITS_TO_SET						      \
 	(IA64_PSR_BN)
 
 
@@ -182,7 +178,7 @@ GLOBAL_ENTRY(ia64_pal_call_phys_static)
 	;;
 	andcm r16=loc3,r16		// removes bits to clear from psr
 	br.call.sptk.many rp=ia64_switch_mode_phys
-.ret1:	mov rp = r8			// install return address (physical)
+	mov rp = r8			// install return address (physical)
 	mov loc5 = r19
 	mov loc6 = r20
 	br.cond.sptk.many b7
@@ -192,7 +188,6 @@ GLOBAL_ENTRY(ia64_pal_call_phys_static)
 	mov r19=loc5
 	mov r20=loc6
 	br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode
-.ret2:
 	mov psr.l = loc3		// restore init PSR
 
 	mov ar.pfs = loc1
@@ -207,8 +202,8 @@ END(ia64_pal_call_phys_static)
  * Make a PAL call using the stacked registers in physical mode.
  *
  * Inputs:
- * 	in0         Index of PAL service
- * 	in2 - in3   Remaning PAL arguments
+ *	in0         Index of PAL service
+ *	in2 - in3   Remaining PAL arguments
  */
 GLOBAL_ENTRY(ia64_pal_call_phys_stacked)
 	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
@@ -216,17 +211,12 @@ GLOBAL_ENTRY(ia64_pal_call_phys_stacked)
 	movl	loc2 = pal_entry_point
 1:	{
 	  mov r28  = in0		// copy procedure index
-	  mov loc0 = rp		// save rp
+	  mov loc0 = rp			// save rp
 	}
 	.body
 	;;
 	ld8 loc2 = [loc2]		// loc2 <- entry point
-	mov out0 = in0		// first argument
-	mov out1 = in1		// copy arg2
-	mov out2 = in2		// copy arg3
-	mov out3 = in3		// copy arg3
-	;;
-	mov loc3 = psr		// save psr
+	mov loc3 = psr			// save psr
 	;;
 	mov loc4=ar.rsc			// save RSE configuration
 #ifdef XEN
@@ -244,18 +234,23 @@ GLOBAL_ENTRY(ia64_pal_call_phys_stacked)
 	;;
 	andcm r16=loc3,r16		// removes bits to clear from psr
 	br.call.sptk.many rp=ia64_switch_mode_phys
-.ret6:
+
+	mov out0 = in0			// first argument
+	mov out1 = in1			// copy arg2
+	mov out2 = in2			// copy arg3
+	mov out3 = in3			// copy arg3
 	mov loc5 = r19
 	mov loc6 = r20
+
 	br.call.sptk.many rp=b7		// now make the call
-.ret7:
+
 	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
 	mov r16=loc3			// r16= original psr
 	mov r19=loc5
 	mov r20=loc6
-	br.call.sptk.many rp=ia64_switch_mode_virt	// return to virtual mode
+	br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode
 
-.ret8:	mov psr.l  = loc3		// restore init PSR
+	mov psr.l  = loc3		// restore init PSR
 	mov ar.pfs = loc1
 	mov rp = loc0
 	;;
@@ -265,10 +260,11 @@ GLOBAL_ENTRY(ia64_pal_call_phys_stacked)
 END(ia64_pal_call_phys_stacked)
 
 /*
- * Save scratch fp scratch regs which aren't saved in pt_regs already (fp10-fp15).
+ * Save scratch fp scratch regs which aren't saved in pt_regs already
+ * (fp10-fp15).
  *
- * NOTE: We need to do this since firmware (SAL and PAL) may use any of the scratch
- * regs fp-low partition.
+ * NOTE: We need to do this since firmware (SAL and PAL) may use any of the
+ * scratch regs fp-low partition.
  *
  * Inputs:
  *      in0	Address of stack storage for fp regs
diff -Naurp xen/arch/ia64/linux-xen/README.origin xen-redhat/arch/ia64/linux-xen/README.origin
--- xen/arch/ia64/linux-xen/README.origin
+++ xen-redhat/arch/ia64/linux-xen/README.origin
@@ -18,7 +18,6 @@ minstate.h		-> linux/arch/ia64/kernel/mi
 mm_contig.c		-> linux/arch/ia64/mm/contig.c
 mm_numa.c		-> linux/arch/ia64/mm/numa.c
 numa.c			-> linux/arch/ia64/kernel/numa.c
-pal.S			-> linux/arch/ia64/kernel/pal.S
 process-linux-xen.c	-> linux/arch/ia64/kernel/process.c
 sal.c			-> linux/arch/ia64/kernel/sal.c
 setup.c			-> linux/arch/ia64/kernel/setup.c
@@ -42,3 +41,5 @@ perfmon_generic.h	-> linux/arch/kernel/p
 perfmon_itanium.h	-> linux/arch/kernel/perfmon_itanium.h
 perfmon_mckinley.h	-> linux/arch/kernel/perfmon_mckinley.h
 perfmon_montecito.h	-> linux/arch/kernel/perfmon_montecito.h
+# The files below are from Linux-2.6.21
+pal.S			-> linux/arch/ia64/kernel/pal.S
diff -Naurp xen/arch/ia64/linux-xen/setup.c xen-redhat/arch/ia64/linux-xen/setup.c
--- xen/arch/ia64/linux-xen/setup.c
+++ xen-redhat/arch/ia64/linux-xen/setup.c
@@ -368,16 +368,21 @@ acpi_oem_console_setup(void)
 	 * Tiger 2: SR870BH2
 	 * Tiger 4: SR870BN4
 	 */
-	if (strncmp(hdr->oem_id, "INTEL", 5) ||
-	    (!strncmp(hdr->oem_table_id, "SR870BH2", 8) &&
-	     !strncmp(hdr->oem_table_id, "SR870BN4", 8)))
-		return -ENODEV;
-
-	ns16550_com1.baud = BAUD_AUTO;
-	ns16550_com1.io_base = 0x2f8;
-	ns16550_com1.irq = 3;
-
-	return 0;
+	if (!strncmp(hdr->oem_id, "INTEL", 5)) {
+		if (!strncmp(hdr->oem_table_id, "SR870BH2", 8) ||
+		    !strncmp(hdr->oem_table_id, "SR870BN4", 8)) {
+			ns16550_com1.baud = BAUD_AUTO;
+			ns16550_com1.io_base = 0x2f8;
+			ns16550_com1.irq = 3;
+			return 0;
+		} else {
+			ns16550_com1.baud = BAUD_AUTO;
+			ns16550_com1.io_base = 0x3f8;
+			ns16550_com1.irq = ns16550_com1_gsi = 4;
+			return 0;
+		}
+	}
+	return -ENODEV;
 }
 #endif
 
@@ -873,7 +878,7 @@ cpu_init (void)
 	cpu_data = per_cpu_init();
 
 #ifdef XEN
-	printk("cpu_init: current=%p\n", current);
+	printk(XENLOG_DEBUG "cpu_init: current=%p\n", current);
 #endif
 
 	/*
diff -Naurp xen/arch/ia64/linux-xen/smp.c xen-redhat/arch/ia64/linux-xen/smp.c
--- xen/arch/ia64/linux-xen/smp.c
+++ xen-redhat/arch/ia64/linux-xen/smp.c
@@ -122,9 +122,7 @@ stop_this_cpu (void)
 	cpu_clear(smp_processor_id(), cpu_online_map);
 	max_xtp();
 	local_irq_disable();
-#ifndef XEN
 	cpu_halt();
-#endif
 }
 
 void
@@ -132,9 +130,7 @@ cpu_die(void)
 {
 	max_xtp();
 	local_irq_disable();
-#ifndef XEN
 	cpu_halt();
-#endif
 	/* Should never be here */
 	BUG();
 	for (;;);
diff -Naurp xen/arch/ia64/linux-xen/sn/kernel/irq.c xen-redhat/arch/ia64/linux-xen/sn/kernel/irq.c
--- xen/arch/ia64/linux-xen/sn/kernel/irq.c
+++ xen-redhat/arch/ia64/linux-xen/sn/kernel/irq.c
@@ -12,7 +12,7 @@
 #include <linux/spinlock.h>
 #include <linux/init.h>
 #ifdef XEN
-#include <linux/pci.h>
+#include <linux/linux-pci.h>
 #include <asm/hw_irq.h>
 #endif
 #include <asm/sn/addrs.h>
diff -Naurp xen/arch/ia64/vmx/Makefile xen-redhat/arch/ia64/vmx/Makefile
--- xen/arch/ia64/vmx/Makefile
+++ xen-redhat/arch/ia64/vmx/Makefile
@@ -18,3 +18,4 @@ obj-y += vmx_virt.o
 obj-y += vmx_vsa.o
 obj-y += vtlb.o
 obj-y += optvfault.o
+obj-y += vacpi.o
diff -Naurp xen/arch/ia64/vmx/mmio.c xen-redhat/arch/ia64/vmx/mmio.c
--- xen/arch/ia64/vmx/mmio.c
+++ xen-redhat/arch/ia64/vmx/mmio.c
@@ -37,6 +37,7 @@
 #include <xen/domain.h>
 #include <asm/viosapic.h>
 #include <asm/vlsapic.h>
+#include <asm/hvm/vacpi.h>
 
 #define HVM_BUFFERED_IO_RANGE_NR 1
 
@@ -214,6 +215,9 @@ static void legacy_io_access(VCPU *vcpu,
     if (vmx_ide_pio_intercept(p, val))
         return;
 
+    if (IS_ACPI_ADDR(p->addr) && vacpi_intercept(p, val))
+	return;
+
     vmx_send_assist_req(v);
     if(dir==IOREQ_READ){ //read
         *val=p->data;
diff -Naurp xen/arch/ia64/vmx/optvfault.S xen-redhat/arch/ia64/vmx/optvfault.S
--- xen/arch/ia64/vmx/optvfault.S
+++ xen-redhat/arch/ia64/vmx/optvfault.S
@@ -7,6 +7,8 @@
  */
 
 #include <linux/config.h>
+#include <asm/config.h>
+#include <asm/pgtable.h>
 #include <asm/asmmacro.h>
 #include <asm/kregs.h>
 #include <asm/offsets.h>
@@ -16,6 +18,7 @@
 #include <asm/vmx_pal_vsa.h>
 #include <asm/asm-offsets.h>
 #include <asm-ia64/vmx_mm_def.h>
+#include <asm/virt_event.h>
 
 #define ACCE_MOV_FROM_AR
 #define ACCE_MOV_FROM_RR
@@ -25,6 +28,94 @@
 #define ACCE_MOV_TO_PSR
 #define ACCE_THASH
 
+// Inputs are: r21 (= current), r24 (= cause), r25 (= insn), r31 (=saved pr)
+
+ENTRY(vmx_dummy_function)
+    br.sptk.many vmx_dummy_function
+END(vmx_dummy_function)
+
+/*
+ *	Inputs:
+ *		r24 : return address
+ *  	r25 : vpd
+ *		r29 : scratch
+ *
+ */
+GLOBAL_ENTRY(vmx_vps_sync_read)
+    movl r29 = vmx_dummy_function
+    ;;
+    mov b0=r29
+    br.sptk.many b0
+END(vmx_vps_sync_read)
+
+/*
+ *	Inputs:
+ *		r24 : return address
+ *  	r25 : vpd
+ *		r29 : scratch
+ *
+ */
+GLOBAL_ENTRY(vmx_vps_sync_write)
+    movl r29 = vmx_dummy_function
+    ;;
+    mov b0=r29
+    br.sptk.many b0
+END(vmx_vps_sync_write)
+
+/*
+ *	Inputs:
+ *		r23 : pr
+ *		r24 : guest b0
+ *  	r25 : vpd
+ *
+ */
+GLOBAL_ENTRY(vmx_vps_resume_normal)
+    movl r29 = vmx_dummy_function
+    ;;
+    mov b0=r29
+    mov pr=r23,-2
+    br.sptk.many b0
+END(vmx_vps_resume_normal)
+
+#define VMX_VPS_SYNC_READ		\
+     add r16=IA64_VPD_BASE_OFFSET,r21;	\
+     mov r17 = b0;			\
+     mov r18 = r24;			\
+     mov r19 = r25;			\
+     mov r20 = r31;			\
+     ;;					\
+     movl r24 = 1f;			\
+     ld8 r16 = [r16];			\
+     ;;					\
+     mov r25 =r16;			\
+     br.sptk.many vmx_vps_sync_read;	\
+1:					\
+     mov b0 = r17;			\
+     mov r24 = r18;			\
+     mov r25 = r19;			\
+     mov r31 = r20
+   
+
+/*
+ *	Inputs:
+ *		r23 : pr
+ *		r24 : guest b0
+ *  	r25 : vpd
+ *		r17 : isr
+ */
+GLOBAL_ENTRY(vmx_vps_resume_handler)
+    movl r29 = vmx_dummy_function
+    ;;
+    ld8 r26=[r25]
+    shr r17=r17,IA64_ISR_IR_BIT
+    ;;
+    dep r26=r17,r26,63,1   // bit 63 of r26 indicate whether enable CFLE
+    mov b0=r29
+    mov pr=r23,-2
+    br.sptk.many b0
+END(vmx_vps_resume_handler)
+
+
 //mov r1=ar3
 GLOBAL_ENTRY(vmx_asm_mov_from_ar)
 #ifndef ACCE_MOV_FROM_AR
@@ -42,7 +133,7 @@ GLOBAL_ENTRY(vmx_asm_mov_from_ar)
     add r19=r19,r18
     movl r20=asm_mov_to_reg
     ;;
-    adds r30=vmx_resume_to_guest-asm_mov_to_reg,r20
+    adds r30=vmx_resume_to_guest2-asm_mov_to_reg,r20
     shladd r17=r17,4,r20
     cmp.gtu p6,p0=r16,r19
     ;;
@@ -71,7 +162,7 @@ GLOBAL_ENTRY(vmx_asm_mov_from_rr)
     br.many b0
     ;;   
 vmx_asm_mov_from_rr_back_1:  
-    adds r30=vmx_resume_to_guest-asm_mov_from_reg,r20
+    adds r30=vmx_resume_to_guest2-asm_mov_from_reg,r20
     adds r22=asm_mov_to_reg-asm_mov_from_reg,r20
     shr.u r26=r19,61
     ;;
@@ -89,13 +180,16 @@ GLOBAL_ENTRY(vmx_asm_mov_to_rr)
 #ifndef ACCE_MOV_TO_RR
     br.many vmx_virtualization_fault_back
 #endif
-    extr.u r16=r25,20,7
-    extr.u r17=r25,13,7
+    add r22=IA64_VCPU_DOMAIN_OFFSET,r21
+    extr.u r16=r25,20,7		// r3
+    extr.u r17=r25,13,7		// r2
+    ;;
+    ld8 r22=[r22]		// Get domain
     movl r20=asm_mov_from_reg
     ;;
     adds r30=vmx_asm_mov_to_rr_back_1-asm_mov_from_reg,r20
-    shladd r16=r16,4,r20
-    mov r22=b0
+    shladd r16=r16,4,r20	// get r3
+    mov r18=b0			// save b0
     ;;
     add r27=VCPU_VRR0_OFS,r21
     mov b0=r16
@@ -103,47 +197,56 @@ GLOBAL_ENTRY(vmx_asm_mov_to_rr)
     ;;   
 vmx_asm_mov_to_rr_back_1:
     adds r30=vmx_asm_mov_to_rr_back_2-asm_mov_from_reg,r20
-    shr.u r23=r19,61
-    shladd r17=r17,4,r20
+    shr.u r23=r19,61		// get RR #
+    shladd r17=r17,4,r20	// get r2
     ;;
     //if rr7, go back
     cmp.eq p6,p0=7,r23
-    mov b0=r22
+    mov b0=r18			// restore b0
     (p6) br.cond.dpnt.many vmx_virtualization_fault_back
     ;;
-    mov r28=r19
+    mov r28=r19			// save r3
     mov b0=r17
     br.many b0
 vmx_asm_mov_to_rr_back_2: 
-    adds r30=vmx_resume_to_guest-asm_mov_from_reg,r20
-    shladd r27=r23,3,r27
-    ;; // +starting_rid
-    st8 [r27]=r19
-    mov b0=r30
+    adds r30=vmx_resume_to_guest2-asm_mov_from_reg,r20
+    shladd r27=r23,3,r27	// address of VRR
+    add r22=IA64_DOMAIN_RID_BITS_OFFSET,r22
     ;;
+    ld1 r22=[r22]		// Load rid_bits from domain
+    mov b0=r18			// restore b0
     adds r16=IA64_VCPU_STARTING_RID_OFFSET,r21
     ;;
-    ld4 r16=[r16]
+    ld4 r16=[r16]		// load starting_rid
+    extr.u r17=r19,8,24		// Extract RID
     ;;
+    shr r17=r17,r22		// Shift out used bits
     shl r16=r16,8
     ;;
-    add r19=r19,r16
+    add r20=r19,r16
+    cmp.ne p6,p0=0,r17 // If reserved RID bits are set, use C fall back.
+    (p6) br.cond.dpnt.many vmx_virtualization_fault_back
     ;; //mangling rid 1 and 3
-    extr.u r16=r19,8,8
-    extr.u r17=r19,24,8
-    extr.u r18=r19,2,6
+    extr.u r16=r20,8,8
+    extr.u r17=r20,24,8
+    mov r24=r18			// saved b0 for resume
     ;;
-    dep r19=r16,r19,24,8
+    extr.u r18=r20,2,6 // page size
+    dep r20=r16,r20,24,8
+    mov b0=r30
     ;;
-    dep r19=r17,r19,8,8
+    dep r20=r17,r20,8,8
     ;; //set ve 1
-    dep r19=-1,r19,0,1  
-    cmp.lt p6,p0=14,r18
+    dep r20=-1,r20,0,1
+    // If ps > PAGE_SHIFT, use PAGE_SHIFT
+    cmp.lt p6,p0=PAGE_SHIFT,r18
     ;;
-    (p6) mov r18=14
+    (p6) mov r18=PAGE_SHIFT
     ;;
-    (p6) dep r19=r18,r19,2,6
+    (p6) dep r20=r18,r20,2,6
     ;;
+    st8 [r27]=r19	// Write to vrr.
+    // Write to sav_rr if rr=0 or rr=4.
     cmp.eq p6,p0=0,r23
     ;;
     cmp.eq.or p6,p0=4,r23
@@ -155,11 +258,10 @@ vmx_asm_mov_to_rr_back_2: 
     cmp.eq p7,p0=r0,r0
     (p6) shladd r17=r23,1,r17
     ;;
-    (p6) st8 [r17]=r19
+    (p6) st8 [r17]=r20
     (p6) tbit.nz p6,p7=r16,0
     ;;
-    (p7) mov rr[r28]=r19
-    mov r24=r22
+    (p7) mov rr[r28]=r20
     br.many b0
 END(vmx_asm_mov_to_rr)
 
@@ -169,11 +271,11 @@ GLOBAL_ENTRY(vmx_asm_rsm)
 #ifndef ACCE_RSM
     br.many vmx_virtualization_fault_back
 #endif
-    add r16=IA64_VPD_BASE_OFFSET,r21
+    VMX_VPS_SYNC_READ
+    ;;
     extr.u r26=r25,6,21
     extr.u r27=r25,31,2
     ;;
-    ld8 r16=[r16]
     extr.u r28=r25,36,1
     dep r26=r27,r26,21,2
     ;;
@@ -231,11 +333,11 @@ GLOBAL_ENTRY(vmx_asm_ssm)
 #ifndef ACCE_SSM
     br.many vmx_virtualization_fault_back
 #endif
-    add r16=IA64_VPD_BASE_OFFSET,r21
+    VMX_VPS_SYNC_READ
+    ;;
     extr.u r26=r25,6,21
     extr.u r27=r25,31,2
     ;;
-    ld8 r16=[r16]
     extr.u r28=r25,36,1
     dep r26=r27,r26,21,2
     ;;  //r26 is imm24
@@ -305,10 +407,9 @@ GLOBAL_ENTRY(vmx_asm_mov_to_psr)
 #ifndef ACCE_MOV_TO_PSR
     br.many vmx_virtualization_fault_back
 #endif
-    add r16=IA64_VPD_BASE_OFFSET,r21
-    extr.u r26=r25,13,7 //r2
+    VMX_VPS_SYNC_READ
     ;;
-    ld8 r16=[r16]
+    extr.u r26=r25,13,7 //r2
     movl r20=asm_mov_from_reg
     ;;
     adds r30=vmx_asm_mov_to_psr_back-asm_mov_from_reg,r20
@@ -403,7 +504,18 @@ END(vmx_asm_mov_to_psr)
 
 ENTRY(vmx_asm_dispatch_vexirq)
 //increment iip
+    mov r17 = b0
+    mov r18 = r31
+    add r25=IA64_VPD_BASE_OFFSET,r21;
+    movl r24 =1f
+    ;;
+    ld8 r25 = [r25]
+    br.sptk.many vmx_vps_sync_write
+1: 
+    mov b0 =r17
     mov r16=cr.ipsr
+    mov r31 = r18
+    mov r19 = 37
     ;;
     extr.u r17=r16,IA64_PSR_RI_BIT,2
     tbit.nz p6,p7=r16,IA64_PSR_RI_BIT+1
@@ -420,7 +532,7 @@ ENTRY(vmx_asm_dispatch_vexirq)
     br.many vmx_dispatch_vexirq
 END(vmx_asm_dispatch_vexirq)
 
-// thash
+// thash r1=r3
 // TODO: add support when pta.vf = 1
 GLOBAL_ENTRY(vmx_asm_thash)
 #ifndef ACCE_THASH
@@ -433,8 +545,7 @@ GLOBAL_ENTRY(vmx_asm_thash)
     adds r30=vmx_asm_thash_back1-asm_mov_from_reg,r20
     shladd r17=r17,4,r20	// get addr of MOVE_FROM_REG(r17)
     adds r16=IA64_VPD_BASE_OFFSET,r21	// get vcpu.arch.priveregs
-    ;;
-    mov r24=b0
+    mov r24=b0			// save b0
     ;;
     ld8 r16=[r16]		// get VPD addr
     mov b0=r17
@@ -442,20 +553,25 @@ GLOBAL_ENTRY(vmx_asm_thash)
     ;;                                                     
 vmx_asm_thash_back1:
     shr.u r23=r19,61		// get RR number
-    adds r25=VCPU_VRR0_OFS,r21	// get vcpu->arch.arch_vmx.vrr[0]'s addr
+    adds r28=VCPU_VRR0_OFS,r21	// get vcpu->arch.arch_vmx.vrr[0]'s addr
     adds r16=IA64_VPD_VPTA_OFFSET,r16	// get vpta 
     ;;
-    shladd r27=r23,3,r25	// get vcpu->arch.arch_vmx.vrr[r23]'s addr
+    shladd r27=r23,3,r28	// get vcpu->arch.arch_vmx.vrr[r23]'s addr
     ld8 r17=[r16]		// get PTA
     mov r26=1
     ;;
     extr.u r29=r17,2,6		// get pta.size
-    ld8 r25=[r27]		// get vcpu->arch.arch_vmx.vrr[r23]'s value
+    ld8 r28=[r27]		// get vcpu->arch.arch_vmx.vrr[r23]'s value
+    mov b0=r24
     ;;
-    extr.u r25=r25,2,6		// get rr.ps
+    // Fall-back to C if VF (long format) is set
+    tbit.nz p6,p0=r17,8
+    (p6) mov r24 =EVENT_THASH
+    (p6) br.cond.dpnt.many vmx_virtualization_fault_back
+    extr.u r28=r28,2,6		// get rr.ps
     shl r22=r26,r29		// 1UL << pta.size
     ;;
-    shr.u r23=r19,r25		// vaddr >> rr.ps
+    shr.u r23=r19,r28		// vaddr >> rr.ps
     adds r26=3,r29		// pta.size + 3 
     shl r27=r17,3		// pta << 3 
     ;;
@@ -473,7 +589,7 @@ vmx_asm_thash_back1:
     ;;
     or r19=r19,r22		// calc pval
     shladd r17=r18,4,r26
-    adds r30=vmx_resume_to_guest-asm_mov_from_reg,r20
+    adds r30=vmx_resume_to_guest2-asm_mov_from_reg,r20
     ;;
     mov b0=r17
     br.many b0
@@ -594,6 +710,8 @@ MOV_FROM_BANK0_REG(31)
 
 
 // mov from reg table
+// r19:		value, r30: return address
+// r26 may be destroyed
 ENTRY(asm_mov_from_reg)
     MOV_FROM_REG(0)
     MOV_FROM_REG(1)
@@ -732,6 +850,18 @@ END(asm_mov_from_reg)
  * r24: b0
  */
 ENTRY(vmx_resume_to_guest)
+    adds r19=IA64_VPD_BASE_OFFSET,r21
+    mov r16 = r31
+    mov r17 = r24
+    ;;
+    ld8 r25 =[r19]
+    movl r24 = 1f
+    br.sptk.many vmx_vps_sync_write
+1:
+    mov r31 = r16
+    mov r24 =r17
+    ;;
+vmx_resume_to_guest2:
     mov r16=cr.ipsr
     movl r20=__vsa_base
     ;;
diff -Naurp xen/arch/ia64/vmx/vacpi.c xen-redhat/arch/ia64/vmx/vacpi.c
--- xen/arch/ia64/vmx/vacpi.c
+++ xen-redhat/arch/ia64/vmx/vacpi.c
@@ -0,0 +1,179 @@
+/*
+ * vacpi.c: emulation of the ACPI
+ * based on x86 hvm/pmtimer.c
+ *
+ * Copyright (c) 2007, FUJITSU LIMITED
+ *      Kouya Shimura <kouya at jp fujitsu com>
+ *
+ * Copyright (c) 2007, XenSource inc.
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <asm/vmx_vcpu.h>
+#include <asm/vmx.h>
+#include <asm/hvm/vacpi.h>
+
+/* The interesting bits of the PM1a_STS register */
+#define TMR_STS    (1 << 0)
+#define PWRBTN_STS (1 << 5)
+#define GBL_STS    (1 << 8)
+
+/* The same in PM1a_EN */
+#define TMR_EN     (1 << 0)
+#define PWRBTN_EN  (1 << 5)
+#define GBL_EN     (1 << 8)
+
+/* Mask of bits in PM1a_STS that can generate an SCI.  Although the ACPI
+ * spec lists other bits, the PIIX4, which we are emulating, only
+ * supports these three.  For now, we only use TMR_STS; in future we
+ * will let qemu set the other bits */
+#define SCI_MASK (TMR_STS|PWRBTN_STS|GBL_STS)
+
+/* SCI IRQ number (must match SCI_INT number in ACPI FADT in hvmloader) */
+#define SCI_IRQ 9
+
+/* We provide a 32-bit counter (must match the TMR_VAL_EXT bit in the FADT) */
+#define TMR_VAL_MASK  (0xffffffff)
+#define TMR_VAL_MSB   (0x80000000)
+
+/* Dispatch SCIs based on the PM1a_STS and PM1a_EN registers */
+static void pmt_update_sci(struct domain *d, struct vacpi *s)
+{
+	if (s->regs.pm1a_en & s->regs.pm1a_sts & SCI_MASK)
+		viosapic_set_irq(d, SCI_IRQ, 1);  /* Assert */
+	else
+		viosapic_set_irq(d, SCI_IRQ, 0);
+}
+
+/* Set the correct value in the timer, accounting for time elapsed
+ * since the last time we did that. */
+static void pmt_update_time(struct domain *d)
+{
+	struct vacpi *s = &d->arch.hvm_domain.vacpi;
+	s_time_t curr_gtime;
+	unsigned long delta;
+	uint32_t msb = s->regs.tmr_val & TMR_VAL_MSB;
+
+	/* Update the timer */
+	curr_gtime = NOW();
+	delta = curr_gtime - s->last_gtime;
+	delta = ((delta >> 8) * ((FREQUENCE_PMTIMER << 32) / SECONDS(1))) >> 24;
+	s->regs.tmr_val += delta;
+	s->regs.tmr_val &= TMR_VAL_MASK;
+	s->last_gtime = curr_gtime;
+
+	/* If the counter's MSB has changed, set the status bit */
+	if ((s->regs.tmr_val & TMR_VAL_MSB) != msb) {
+		s->regs.pm1a_sts |= TMR_STS;
+		pmt_update_sci(d, s);
+	}
+}
+
+/* This function should be called soon after each time the MSB of the
+ * pmtimer register rolls over, to make sure we update the status
+ * registers and SCI at least once per rollover */
+static void pmt_timer_callback(void *opaque)
+{
+	struct domain *d = opaque;
+	struct vacpi *s = &d->arch.hvm_domain.vacpi;
+	uint64_t cycles, time_flip;
+
+	/* Recalculate the timer and make sure we get an SCI if we need one */
+	pmt_update_time(d);
+
+	/* How close are we to the next MSB flip? */
+	cycles = TMR_VAL_MSB - (s->regs.tmr_val & (TMR_VAL_MSB - 1));
+
+	/* Overall time between MSB flips */
+	time_flip = (((SECONDS(1) << 23) / FREQUENCE_PMTIMER) * cycles) >> 23;
+
+	/* Wake up again near the next bit-flip */
+	set_timer(&s->timer, NOW() + time_flip + MILLISECS(1));
+}
+
+int vacpi_intercept(ioreq_t * iop, u64 * val)
+{
+	struct domain *d = current->domain;
+	struct vacpi *s = &d->arch.hvm_domain.vacpi;
+	uint64_t addr_off = iop->addr - ACPI_PM1A_EVT_BLK_ADDRESS;
+
+	if (addr_off < 4) {	/* Access to PM1a_STS and PM1a_EN registers */
+		void *p = (void *)&s->regs.evt_blk + addr_off;
+
+		if (iop->dir == 1) {	/* Read */
+			if (iop->size == 1)
+				*val = *(uint8_t *) p;
+			else if (iop->size == 2)
+				*val = *(uint16_t *) p;
+			else if (iop->size == 4)
+				*val = *(uint32_t *) p;
+			else
+				panic_domain(NULL, "wrong ACPI "
+					     "PM1A_EVT_BLK access\n");
+		} else {	/* Write */
+			uint8_t *sp = (uint8_t *) & iop->data;
+			int i;
+
+			for (i = 0; i < iop->size; i++, addr_off++, p++, sp++) {
+				if (addr_off < 2) /* PM1a_STS */
+					/* write-to-clear */
+					*(uint8_t *) p &= ~*sp;
+				else /* PM1a_EN */
+					*(uint8_t *) p = *sp;
+			}
+			/* Fix the SCI state to match the new register state */
+			pmt_update_sci(d, s);
+		}
+
+		iop->state = STATE_IORESP_READY;
+		vmx_io_assist(current);
+		return 1;
+	}
+
+	if (iop->addr == ACPI_PM_TMR_BLK_ADDRESS) {
+		if (iop->size != 4)
+			panic_domain(NULL, "wrong ACPI PM timer access\n");
+		if (iop->dir == 1) {	/* Read */
+			pmt_update_time(d);
+			*val = s->regs.tmr_val;
+		}
+		/* PM_TMR_BLK is read-only */
+		iop->state = STATE_IORESP_READY;
+		vmx_io_assist(current);
+		return 1;
+	}
+
+	return 0;
+}
+
+void vacpi_init(struct domain *d)
+{
+	struct vacpi *s = &d->arch.hvm_domain.vacpi;
+
+	s->regs.tmr_val = 0;
+	s->regs.evt_blk = 0;
+	s->last_gtime = NOW();
+
+	/* Set up callback to fire SCIs when the MSB of TMR_VAL changes */
+	init_timer(&s->timer, pmt_timer_callback, d, first_cpu(cpu_online_map));
+	pmt_timer_callback(d);
+}
+
+void vacpi_relinquish_resources(struct domain *d)
+{
+	struct vacpi *s = &d->arch.hvm_domain.vacpi;
+	kill_timer(&s->timer);
+}
diff -Naurp xen/arch/ia64/vmx/viosapic.c xen-redhat/arch/ia64/vmx/viosapic.c
--- xen/arch/ia64/vmx/viosapic.c
+++ xen-redhat/arch/ia64/vmx/viosapic.c
@@ -23,6 +23,10 @@
  *
  *  Yunhong Jiang <yunhong.jiang@intel.com>
  *  Ported to xen by using virtual IRQ line.
+ * 
+ *  Copyright (C) 2007 VA Linux Systems Japan K.K.
+ *  Isaku Yamahata <yamahata at valinux co jp>
+ *  SMP support
  */
 
 #include <xen/config.h>
@@ -44,6 +48,7 @@ static void viosapic_deliver(struct vios
     uint8_t vector = viosapic->redirtbl[irq].vector;
     struct vcpu *v;
 
+    ASSERT(spin_is_locked(&viosapic->lock));
     switch ( delivery_mode )
     {
     case SAPIC_FIXED:
@@ -90,6 +95,7 @@ static int get_redir_num(struct viosapic
 {
     int i;
 
+    ASSERT(spin_is_locked(&viosapic->lock));
     for ( i = 0; i < VIOSAPIC_NUM_PINS; i++ )
         if ( viosapic->redirtbl[i].vector == vector )
             return i;
@@ -118,19 +124,24 @@ static void viosapic_update_EOI(struct v
 {
     int redir_num;
 
+    spin_lock(&viosapic->lock);
     if ( (redir_num = get_redir_num(viosapic, vector)) == -1 )
     {
+        spin_unlock(&viosapic->lock);
         gdprintk(XENLOG_WARNING, "Can't find redir item for %d EOI\n", vector);
         return;
     }
 
     if ( !test_and_clear_bit(redir_num, &viosapic->isr) )
     {
-        gdprintk(XENLOG_WARNING, "redir %d not set for %d EOI\n",
-                 redir_num, vector);
+        spin_unlock(&viosapic->lock);
+        if ( viosapic->redirtbl[redir_num].trig_mode == SAPIC_LEVEL )
+            gdprintk(XENLOG_WARNING, "redir %d not set for %d EOI\n",
+                     redir_num, vector);
         return;
     }
     service_iosapic(viosapic);
+    spin_unlock(&viosapic->lock);
 }
 
 
@@ -149,18 +160,21 @@ static unsigned long viosapic_read_indir
 
     default:
     {
-        uint32_t redir_index = (viosapic->ioregsel - 0x10) >> 1;
+        /* ioregsel might be written at the same time. copy it before use. */
+        uint32_t ioregsel = viosapic->ioregsel;
+        uint32_t redir_index;
         uint64_t redir_content;
 
+        redir_index = (ioregsel - 0x10) >> 1;
         if ( redir_index >= VIOSAPIC_NUM_PINS )
         {
             gdprintk(XENLOG_WARNING, "viosapic_read_indirect:undefined "
-                     "ioregsel %x\n", viosapic->ioregsel);
+                     "ioregsel %x\n", ioregsel);
             break;
         }
 
         redir_content = viosapic->redirtbl[redir_index].bits;
-        result = (viosapic->ioregsel & 0x1) ?
+        result = (ioregsel & 0x1) ?
                  (redir_content >> 32) & 0xffffffff :
                  redir_content & 0xffffffff;
         break;
@@ -212,9 +226,12 @@ static void viosapic_write_indirect(stru
 
     default:
     {
-        uint32_t redir_index = (viosapic->ioregsel - 0x10) >> 1;
+        /* ioregsel might be written at the same time. copy it before use. */
+        uint32_t ioregsel = viosapic->ioregsel;
+        uint32_t redir_index;
         uint64_t redir_content;
 
+        redir_index = (ioregsel - 0x10) >> 1;
         if ( redir_index >= VIOSAPIC_NUM_PINS )
         {
             gdprintk(XENLOG_WARNING, "viosapic_write_indirect "
@@ -222,9 +239,10 @@ static void viosapic_write_indirect(stru
             break;
         }
 
+        spin_lock(&viosapic->lock);
         redir_content = viosapic->redirtbl[redir_index].bits;
 
-        if ( viosapic->ioregsel & 0x1 )
+        if ( ioregsel & 0x1 )
         {
             redir_content = (((uint64_t)val & 0xffffffff) << 32) |
                             (redir_content & 0xffffffff);
@@ -235,6 +253,7 @@ static void viosapic_write_indirect(stru
                             (val & 0xffffffff);
         }
         viosapic->redirtbl[redir_index].bits = redir_content;
+        spin_unlock(&viosapic->lock);
         break;
     }
     } /* switch */
diff -Naurp xen/arch/ia64/vmx/vlsapic.c xen-redhat/arch/ia64/vmx/vlsapic.c
--- xen/arch/ia64/vmx/vlsapic.c
+++ xen-redhat/arch/ia64/vmx/vlsapic.c
@@ -38,6 +38,7 @@
 #include <asm/vmx_platform.h>
 #include <asm/viosapic.h>
 #include <asm/vlsapic.h>
+#include <asm/vmx_phy_mode.h>
 #include <asm/linux/jiffies.h>
 #include <xen/domain.h>
 
@@ -517,8 +518,11 @@ void guest_write_eoi(VCPU *vcpu)
     int vec;
 
     vec = highest_inservice_irq(vcpu);
-    if ( vec == NULL_VECTOR ) 
-        panic_domain(vcpu_regs(vcpu), "Wrong vector to EOI\n");
+    if (vec == NULL_VECTOR) {
+        gdprintk(XENLOG_WARNING, "vcpu(%d): Wrong vector to EOI\n",
+                 vcpu->vcpu_id);
+        return;
+    }
     VLSAPIC_INSVC(vcpu,vec>>6) &= ~(1UL <<(vec&63));
     VCPU(vcpu, eoi)=0;    // overwrite the data
     vcpu->arch.irq_new_pending=1;
@@ -607,9 +611,8 @@ struct vcpu * vlsapic_lid_to_vcpu(struct
  * To inject INIT to guest, we must set the PAL_INIT entry 
  * and set psr to switch to physical mode
  */
-#define PAL_INIT_ENTRY 0x80000000ffffffa0
 #define PSR_SET_BITS (IA64_PSR_DT | IA64_PSR_IT | IA64_PSR_RT | \
-                      IA64_PSR_IC | IA64_PSR_RI)
+                      IA64_PSR_IC | IA64_PSR_RI | IA64_PSR_I | IA64_PSR_CPL)
 
 static void vmx_inject_guest_pal_init(VCPU *vcpu)
 {
@@ -771,6 +774,8 @@ static void vlsapic_write_xtp(struct vcp
     struct viosapic * viosapic;
     struct vcpu *lvcpu, *vcpu;
     viosapic = vcpu_viosapic(v); 
+
+    spin_lock(&viosapic->lock);
     lvcpu = viosapic->lowest_vcpu;
     VLSAPIC_XTP(v) = val;
     
@@ -783,6 +788,7 @@ static void vlsapic_write_xtp(struct vcp
         lvcpu = NULL;
 
     viosapic->lowest_vcpu = lvcpu;
+    spin_unlock(&viosapic->lock);
 }
 
 void vlsapic_write(struct vcpu *v,
diff -Naurp xen/arch/ia64/vmx/vmmu.c xen-redhat/arch/ia64/vmx/vmmu.c
--- xen/arch/ia64/vmx/vmmu.c
+++ xen-redhat/arch/ia64/vmx/vmmu.c
@@ -19,23 +19,48 @@
  *  Xuefei Xu (Anthony Xu) (Anthony.xu@intel.com)
  *  Yaozu Dong (Eddie Dong) (Eddie.dong@intel.com)
  */
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <asm/tlb.h>
-#include <asm/gcc_intrin.h>
-#include <asm/vcpu.h>
-#include <linux/interrupt.h>
 #include <asm/vmx_vcpu.h>
-#include <asm/vmx_mm_def.h>
-#include <asm/vmx.h>
-#include <asm/hw_irq.h>
 #include <asm/vmx_pal_vsa.h>
-#include <asm/kregs.h>
-#include <asm/vcpu.h>
-#include <xen/irq.h>
-#include <xen/errno.h>
 #include <xen/sched-if.h>
 
+static int default_vtlb_sz = DEFAULT_VTLB_SZ;
+static int default_vhpt_sz = DEFAULT_VHPT_SZ;
+
+static void __init parse_vtlb_size(char *s)
+{
+    int sz = parse_size_and_unit(s, NULL);
+
+    if (sz > 0) {
+        default_vtlb_sz = fls(sz - 1);
+        /* minimum 16KB (for tag uniqueness) */
+        if (default_vtlb_sz < 14)
+            default_vtlb_sz = 14;
+    }
+}
+
+static int canonicalize_vhpt_size(int sz)
+{
+    /* minimum 32KB */
+    if (sz < 15)
+        return 15;
+    /* maximum 8MB (since purging TR is hard coded) */
+    if (sz > IA64_GRANULE_SHIFT - 1)
+        return IA64_GRANULE_SHIFT - 1;
+    return sz;
+}
+
+static void __init parse_vhpt_size(char *s)
+{
+    int sz = parse_size_and_unit(s, NULL);
+    if (sz > 0) {
+        default_vhpt_sz = fls(sz - 1);
+        default_vhpt_sz = canonicalize_vhpt_size(default_vhpt_sz);
+    }
+}
+
+custom_param("vti_vtlb_size", parse_vtlb_size);
+custom_param("vti_vhpt_size", parse_vhpt_size);
+
 /*
  * Get the machine page frame number in 16KB unit
  * Input:
@@ -89,6 +114,7 @@ void recycle_message(thash_cb_t *hcb, u6
 }
  */
 
+#if 0
 /*
  * Purge all guest TCs in logical processor.
  * Instead of purging all LP TCs, we should only purge   
@@ -129,69 +155,37 @@ purge_machine_tc_by_domid(domid_t domid)
     // purge all TCs belong to this guest.
 #endif
 }
+#endif
 
 static int init_domain_vhpt(struct vcpu *v)
 {
-    struct page_info *page;
-    void * vbase;
-    page = alloc_domheap_pages (NULL, VCPU_VHPT_ORDER, 0);
-    if ( page == NULL ) {
-        printk("No enough contiguous memory for init_domain_vhpt\n");
-        return -ENOMEM;
-    }
-    vbase = page_to_virt(page);
-    memset(vbase, 0, VCPU_VHPT_SIZE);
-    printk(XENLOG_DEBUG "Allocate domain vhpt at 0x%p\n", vbase);
-    
-    VHPT(v,hash) = vbase;
-    VHPT(v,hash_sz) = VCPU_VHPT_SIZE/2;
-    VHPT(v,cch_buf) = (void *)((u64)vbase + VHPT(v,hash_sz));
-    VHPT(v,cch_sz) = VCPU_VHPT_SIZE - VHPT(v,hash_sz);
-    thash_init(&(v->arch.vhpt),VCPU_VHPT_SHIFT-1);
-    v->arch.arch_vmx.mpta = v->arch.vhpt.pta.val;
+    int rc;
 
-    return 0;
+    rc = thash_alloc(&(v->arch.vhpt), default_vhpt_sz, "vhpt");
+    v->arch.arch_vmx.mpta = v->arch.vhpt.pta.val;
+    return rc;
 }
 
 
 static void free_domain_vhpt(struct vcpu *v)
 {
-    struct page_info *page;
-
-    if (v->arch.vhpt.hash) {
-        page = virt_to_page(v->arch.vhpt.hash);
-        free_domheap_pages(page, VCPU_VHPT_ORDER);
-        v->arch.vhpt.hash = 0;
-    }
-
-    return;
+    if (v->arch.vhpt.hash)
+        thash_free(&(v->arch.vhpt));
 }
 
 int init_domain_tlb(struct vcpu *v)
 {
-    struct page_info *page;
-    void * vbase;
     int rc;
 
     rc = init_domain_vhpt(v);
     if (rc)
         return rc;
 
-    page = alloc_domheap_pages (NULL, VCPU_VTLB_ORDER, 0);
-    if ( page == NULL ) {
-        printk("No enough contiguous memory for init_domain_tlb\n");
+    rc = thash_alloc(&(v->arch.vtlb), default_vtlb_sz, "vtlb");
+    if (rc) {
         free_domain_vhpt(v);
-        return -ENOMEM;
+        return rc;
     }
-    vbase = page_to_virt(page);
-    memset(vbase, 0, VCPU_VTLB_SIZE);
-    printk(XENLOG_DEBUG "Allocate domain vtlb at 0x%p\n", vbase);
-    
-    VTLB(v,hash) = vbase;
-    VTLB(v,hash_sz) = VCPU_VTLB_SIZE/2;
-    VTLB(v,cch_buf) = (void *)((u64)vbase + VTLB(v,hash_sz));
-    VTLB(v,cch_sz) = VCPU_VTLB_SIZE - VTLB(v,hash_sz);
-    thash_init(&(v->arch.vtlb),VCPU_VTLB_SHIFT-1);
     
     return 0;
 }
@@ -199,12 +193,8 @@ int init_domain_tlb(struct vcpu *v)
 
 void free_domain_tlb(struct vcpu *v)
 {
-    struct page_info *page;
-
-    if ( v->arch.vtlb.hash) {
-        page = virt_to_page(v->arch.vtlb.hash);
-        free_domheap_pages(page, VCPU_VTLB_ORDER);
-    }
+    if (v->arch.vtlb.hash)
+        thash_free(&(v->arch.vtlb));
 
     free_domain_vhpt(v);
 }
@@ -252,41 +242,9 @@ void machine_tlb_insert(struct vcpu *v, 
  */
 void machine_tlb_purge(u64 va, u64 ps)
 {
-//    u64       psr;
-//    psr = ia64_clear_ic();
     ia64_ptcl(va, ps << 2);
-//    ia64_set_psr(psr);
-//    ia64_srlz_i();
-//    return;
-}
-/*
-u64 machine_thash(u64 va)
-{
-    return ia64_thash(va);
-}
-
-u64 machine_ttag(u64 va)
-{
-    return ia64_ttag(va);
-}
-*/
-thash_data_t * vsa_thash(PTA vpta, u64 va, u64 vrr, u64 *tag)
-{
-    u64 index,pfn,rid,pfn_bits;
-    pfn_bits = vpta.size-5-8;
-    pfn = REGION_OFFSET(va)>>_REGION_PAGE_SIZE(vrr);
-    rid = _REGION_ID(vrr);
-    index = ((rid&0xff)<<pfn_bits)|(pfn&((1UL<<pfn_bits)-1));
-    *tag = ((rid>>8)&0xffff) | ((pfn >>pfn_bits)<<16);
-    return (thash_data_t *)((vpta.base<<PTA_BASE_SHIFT)+(index<<5));
-//    return ia64_call_vsa(PAL_VPS_THASH,va,vrr,vpta,0,0,0,0);
 }
 
-//u64 vsa_ttag(u64 va, u64 vrr)
-//{
-//    return ia64_call_vsa(PAL_VPS_TTAG,va,vrr,0,0,0,0,0);
-//}
-
 int vhpt_enabled(VCPU *vcpu, uint64_t vadr, vhpt_ref_t ref)
 {
     ia64_rr  vrr;
@@ -544,8 +502,7 @@ IA64FAULT vmx_vcpu_ptc_e(VCPU *vcpu, u64
 
 IA64FAULT vmx_vcpu_ptc_g(VCPU *vcpu, u64 va, u64 ps)
 {
-    vmx_vcpu_ptc_ga(vcpu, va, ps);
-    return IA64_ILLOP_FAULT;
+    return vmx_vcpu_ptc_ga(vcpu, va, ps);
 }
 /*
 IA64FAULT vmx_vcpu_ptc_ga(VCPU *vcpu, u64 va, u64 ps)
diff -Naurp xen/arch/ia64/vmx/vmx_entry.S xen-redhat/arch/ia64/vmx/vmx_entry.S
--- xen/arch/ia64/vmx/vmx_entry.S
+++ xen-redhat/arch/ia64/vmx/vmx_entry.S
@@ -20,21 +20,9 @@
  *  Kun Tian (Kevin Tian) (kevin.tian@intel.com)
  */
 
-#ifndef VCPU_TLB_SHIFT
-#define VCPU_TLB_SHIFT	22
-#endif
 #include <linux/config.h>
 #include <asm/asmmacro.h>
-#include <asm/cache.h>
-#include <asm/kregs.h>
 #include <asm/offsets.h>
-#include <asm/pgtable.h>
-#include <asm/percpu.h>
-#include <asm/processor.h>
-#include <asm/thread_info.h>
-#include <asm/unistd.h>
-#include <asm/vhpt.h>
-#include <asm/vmmu.h>
 #include "vmx_minstate.h"
 
 GLOBAL_ENTRY(ia64_leave_nested)
@@ -373,20 +361,16 @@ vmx_rse_clear_invalid:
     adds r19=VPD(VPSR),r18
     ;;
     ld8 r19=[r19]        //vpsr
-    movl r20=__vsa_base
     ;;
 //vsa_sync_write_start
-    ld8 r20=[r20]       // read entry point
-    mov r25=r18
-    ;;
     movl r24=ia64_vmm_entry  // calculate return address
-    add r16=PAL_VPS_SYNC_WRITE,r20
-    ;;
-    mov b0=r16
-    br.cond.sptk b0         // call the service
+    mov r25=r18
+    br.sptk.many vmx_vps_sync_write        // call the service
     ;;
 END(ia64_leave_hypervisor)
 // fall through
+
+
 GLOBAL_ENTRY(ia64_vmm_entry)
 /*
  *  must be at bank 0
@@ -394,32 +378,18 @@ GLOBAL_ENTRY(ia64_vmm_entry)
  *  r17:cr.isr
  *  r18:vpd
  *  r19:vpsr
- *  r20:__vsa_base
  *  r22:b0
  *  r23:predicate
  */
     mov r24=r22
     mov r25=r18
     tbit.nz p1,p2 = r19,IA64_PSR_IC_BIT        // p1=vpsr.ic
+    (p1) br.cond.sptk.few vmx_vps_resume_normal
+    (p2) br.cond.sptk.many vmx_vps_resume_handler
     ;;
-    (p1) add r29=PAL_VPS_RESUME_NORMAL,r20
-    (p1) br.sptk.many ia64_vmm_entry_out
-    ;;
-    tbit.nz p1,p2 = r17,IA64_ISR_IR_BIT		//p1=cr.isr.ir
-    ;;
-    (p1) add r29=PAL_VPS_RESUME_NORMAL,r20
-    (p2) add r29=PAL_VPS_RESUME_HANDLER,r20
-    (p2) ld8 r26=[r25]
-    ;;
-ia64_vmm_entry_out:    
-    mov pr=r23,-2
-    mov b0=r29
-    ;;
-    br.cond.sptk b0             // call pal service
 END(ia64_vmm_entry)
 
 
-
 /*
  * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't
  *  need to switch to bank 0 and doesn't restore the scratch registers.
@@ -719,7 +689,7 @@ GLOBAL_ENTRY(vmx_switch_rr7)
    movl r25=PAGE_KERNEL
    ;;
    or loc5 = r25,loc5          // construct PA | page properties
-   mov r23 = VCPU_VHPT_SHIFT <<2
+   mov r23 = IA64_GRANULE_SHIFT <<2
    ;;
    ptr.d   in3,r23
    ;;
diff -Naurp xen/arch/ia64/vmx/vmx_init.c xen-redhat/arch/ia64/vmx/vmx_init.c
--- xen/arch/ia64/vmx/vmx_init.c
+++ xen-redhat/arch/ia64/vmx/vmx_init.c
@@ -51,6 +51,8 @@
 #include <asm/viosapic.h>
 #include <xen/event.h>
 #include <asm/vlsapic.h>
+#include <asm/vmx_pal_vsa.h>
+#include <asm/patch.h>
 #include "entry.h"
 
 /* Global flag to identify whether Intel vmx feature is on */
@@ -62,6 +64,28 @@ static u64 vm_buffer = 0;	/* Buffer requ
 u64 __vsa_base = 0;	/* Run-time service base of VMX */
 
 /* Check whether vt feature is enabled or not. */
+
+void vmx_vps_patch(void)
+{
+	u64 addr;
+	
+	addr = (u64)&vmx_vps_sync_read;
+	ia64_patch_imm64(addr, __vsa_base+PAL_VPS_SYNC_READ);
+	ia64_fc((void *)addr);
+	addr = (u64)&vmx_vps_sync_write;
+	ia64_patch_imm64(addr, __vsa_base+PAL_VPS_SYNC_WRITE);
+	ia64_fc((void *)addr);
+	addr = (u64)&vmx_vps_resume_normal;
+	ia64_patch_imm64(addr, __vsa_base+PAL_VPS_RESUME_NORMAL);
+	ia64_fc((void *)addr);
+	addr = (u64)&vmx_vps_resume_handler;
+	ia64_patch_imm64(addr, __vsa_base+PAL_VPS_RESUME_HANDLER);
+	ia64_fc((void *)addr);
+	ia64_sync_i();
+	ia64_srlz_i();	
+}
+
+
 void
 identify_vmx_feature(void)
 {
@@ -130,8 +154,10 @@ vmx_init_env(void)
 		return ;
 	}
 
-	if (!__vsa_base)
+	if (!__vsa_base){
 		__vsa_base = tmp_base;
+		vmx_vps_patch();
+	}
 	else
 		ASSERT(tmp_base != __vsa_base);
 
@@ -220,14 +246,7 @@ vmx_create_vp(struct vcpu *v)
 void
 vmx_save_state(struct vcpu *v)
 {
-	u64 status;
-
-	/* FIXME: about setting of pal_proc_vector... time consuming */
-	status = ia64_pal_vp_save((u64 *)v->arch.privregs, 0);
-	if (status != PAL_STATUS_SUCCESS){
-		panic_domain(vcpu_regs(v),"Save vp status failed\n");
-	}
-
+	ia64_call_vsa(PAL_VPS_SAVE, (u64)v->arch.privregs, 1, 0, 0, 0, 0, 0);
 
 	/* Need to save KR when domain switch, though HV itself doesn;t
 	 * use them.
@@ -246,12 +265,7 @@ vmx_save_state(struct vcpu *v)
 void
 vmx_load_state(struct vcpu *v)
 {
-	u64 status;
-
-	status = ia64_pal_vp_restore((u64 *)v->arch.privregs, 0);
-	if (status != PAL_STATUS_SUCCESS){
-		panic_domain(vcpu_regs(v),"Restore vp status failed\n");
-	}
+	ia64_call_vsa(PAL_VPS_RESTORE, (u64)v->arch.privregs, 1, 0, 0, 0, 0, 0);
 
 	ia64_set_kr(0, v->arch.arch_vmx.vkr[0]);
 	ia64_set_kr(1, v->arch.arch_vmx.vkr[1]);
@@ -350,6 +364,8 @@ vmx_relinquish_guest_resources(struct do
 
 	for_each_vcpu(d, v)
 		vmx_release_assist_channel(v);
+
+	vacpi_relinquish_resources(d);
 }
 
 void
@@ -418,6 +434,8 @@ void vmx_setup_platform(struct domain *d
 
 	/* Initialize iosapic model within hypervisor */
 	viosapic_init(d);
+
+	vacpi_init(d);
 }
 
 void vmx_do_launch(struct vcpu *v)
diff -Naurp xen/arch/ia64/vmx/vmx_init.c.orig xen-redhat/arch/ia64/vmx/vmx_init.c.orig
--- xen/arch/ia64/vmx/vmx_init.c.orig
+++ xen-redhat/arch/ia64/vmx/vmx_init.c.orig
@@ -0,0 +1,426 @@
+/* -*-  Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
+/*
+ * vmx_init.c: initialization work for vt specific domain
+ * Copyright (c) 2005, Intel Corporation.
+ *	Kun Tian (Kevin Tian) <kevin.tian@intel.com>
+ *	Xuefei Xu (Anthony Xu) <anthony.xu@intel.com>
+ *	Fred Yang <fred.yang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+/*
+ * 05/08/16 Kun tian (Kevin Tian) <kevin.tian@intel.com>:
+ * Disable doubling mapping
+ *
+ * 05/03/23 Kun Tian (Kevin Tian) <kevin.tian@intel.com>:
+ * Simplied design in first step:
+ *	- One virtual environment
+ *	- Domain is bound to one LP
+ * Later to support guest SMP:
+ *	- Need interface to handle VP scheduled to different LP
+ */
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/sched.h>
+#include <asm/pal.h>
+#include <asm/page.h>
+#include <asm/processor.h>
+#include <asm/vmx_vcpu.h>
+#include <xen/lib.h>
+#include <asm/vmmu.h>
+#include <public/xen.h>
+#include <public/hvm/ioreq.h>
+#include <public/event_channel.h>
+#include <asm/vmx_phy_mode.h>
+#include <asm/processor.h>
+#include <asm/vmx.h>
+#include <xen/mm.h>
+#include <asm/viosapic.h>
+#include <xen/event.h>
+#include <asm/vlsapic.h>
+#include "entry.h"
+
+/* Global flag to identify whether Intel vmx feature is on */
+u32 vmx_enabled = 0;
+static u32 vm_order;
+static u64 buffer_size;
+static u64 vp_env_info;
+static u64 vm_buffer = 0;	/* Buffer required to bring up VMX feature */
+u64 __vsa_base = 0;	/* Run-time service base of VMX */
+
+/* Check whether vt feature is enabled or not. */
+void
+identify_vmx_feature(void)
+{
+	pal_status_t ret;
+	u64 avail = 1, status = 1, control = 1;
+
+	vmx_enabled = 0;
+	/* Check VT-i feature */
+	ret = ia64_pal_proc_get_features(&avail, &status, &control);
+	if (ret != PAL_STATUS_SUCCESS) {
+		printk("Get proc features failed.\n");
+		goto no_vti;
+	}
+
+	/* FIXME: do we need to check status field, to see whether
+	 * PSR.vm is actually enabled? If yes, aonther call to
+	 * ia64_pal_proc_set_features may be reuqired then.
+	 */
+	printk("avail:0x%lx, status:0x%lx,control:0x%lx, vm?0x%lx\n",
+		avail, status, control, avail & PAL_PROC_VM_BIT);
+	if (!(avail & PAL_PROC_VM_BIT)) {
+		printk("No VT feature supported.\n");
+		goto no_vti;
+	}
+
+	ret = ia64_pal_vp_env_info(&buffer_size, &vp_env_info);
+	if (ret != PAL_STATUS_SUCCESS) {
+		printk("Get vp environment info failed.\n");
+		goto no_vti;
+	}
+
+	/* Does xen has ability to decode itself? */
+	if (!(vp_env_info & VP_OPCODE))
+		printk("WARNING: no opcode provided from hardware(%lx)!!!\n", vp_env_info);
+	vm_order = get_order(buffer_size);
+	printk("vm buffer size: %ld, order: %d\n", buffer_size, vm_order);
+
+	vmx_enabled = 1;
+no_vti:
+	return;
+}
+
+/*
+ * Init virtual environment on current LP
+ * vsa_base is the indicator whether it's first LP to be initialized
+ * for current domain.
+ */ 
+void
+vmx_init_env(void)
+{
+	u64 status, tmp_base;
+
+	if (!vm_buffer) {
+		vm_buffer = (unsigned long)alloc_xenheap_pages(vm_order);
+		ASSERT(vm_buffer);
+		printk("vm_buffer: 0x%lx\n", vm_buffer);
+	}
+
+	status=ia64_pal_vp_init_env(__vsa_base ? VP_INIT_ENV : VP_INIT_ENV_INITALIZE,
+				    __pa(vm_buffer),
+				    vm_buffer,
+				    &tmp_base);
+
+	if (status != PAL_STATUS_SUCCESS) {
+		printk("ia64_pal_vp_init_env failed.\n");
+		return ;
+	}
+
+	if (!__vsa_base)
+		__vsa_base = tmp_base;
+	else
+		ASSERT(tmp_base != __vsa_base);
+
+}
+
+typedef union {
+	u64 value;
+	struct {
+		u64 number : 8;
+		u64 revision : 8;
+		u64 model : 8;
+		u64 family : 8;
+		u64 archrev : 8;
+		u64 rv : 24;
+	};
+} cpuid3_t;
+
+/* Allocate vpd from xenheap */
+static vpd_t *alloc_vpd(void)
+{
+	int i;
+	cpuid3_t cpuid3;
+	vpd_t *vpd;
+	mapped_regs_t *mregs;
+
+	vpd = alloc_xenheap_pages(get_order(VPD_SIZE));
+	if (!vpd) {
+		printk("VPD allocation failed.\n");
+		return NULL;
+	}
+
+	printk(XENLOG_DEBUG "vpd base: 0x%p, vpd size:%ld\n",
+	       vpd, sizeof(vpd_t));
+	memset(vpd, 0, VPD_SIZE);
+	mregs = &vpd->vpd_low;
+
+	/* CPUID init */
+	for (i = 0; i < 5; i++)
+		mregs->vcpuid[i] = ia64_get_cpuid(i);
+
+	/* Limit the CPUID number to 5 */
+	cpuid3.value = mregs->vcpuid[3];
+	cpuid3.number = 4;	/* 5 - 1 */
+	mregs->vcpuid[3] = cpuid3.value;
+
+	mregs->vac.a_from_int_cr = 1;
+	mregs->vac.a_to_int_cr = 1;
+	mregs->vac.a_from_psr = 1;
+	mregs->vac.a_from_cpuid = 1;
+	mregs->vac.a_cover = 1;
+	mregs->vac.a_bsw = 1;
+	mregs->vac.a_int = 1;
+	mregs->vdc.d_vmsw = 1;
+
+	return vpd;
+}
+
+/* Free vpd to xenheap */
+static void
+free_vpd(struct vcpu *v)
+{
+	if ( v->arch.privregs )
+		free_xenheap_pages(v->arch.privregs, get_order(VPD_SIZE));
+}
+
+/*
+ * Create a VP on intialized VMX environment.
+ */
+static void
+vmx_create_vp(struct vcpu *v)
+{
+	u64 ret;
+	vpd_t *vpd = (vpd_t *)v->arch.privregs;
+	u64 ivt_base;
+	extern char vmx_ia64_ivt;
+	/* ia64_ivt is function pointer, so need this tranlation */
+	ivt_base = (u64) &vmx_ia64_ivt;
+	printk(XENLOG_DEBUG "ivt_base: 0x%lx\n", ivt_base);
+	ret = ia64_pal_vp_create((u64 *)vpd, (u64 *)ivt_base, 0);
+	if (ret != PAL_STATUS_SUCCESS){
+		panic_domain(vcpu_regs(v),"ia64_pal_vp_create failed. \n");
+	}
+}
+
+/* Other non-context related tasks can be done in context switch */
+void
+vmx_save_state(struct vcpu *v)
+{
+	u64 status;
+
+	/* FIXME: about setting of pal_proc_vector... time consuming */
+	status = ia64_pal_vp_save((u64 *)v->arch.privregs, 0);
+	if (status != PAL_STATUS_SUCCESS){
+		panic_domain(vcpu_regs(v),"Save vp status failed\n");
+	}
+
+
+	/* Need to save KR when domain switch, though HV itself doesn;t
+	 * use them.
+	 */
+	v->arch.arch_vmx.vkr[0] = ia64_get_kr(0);
+	v->arch.arch_vmx.vkr[1] = ia64_get_kr(1);
+	v->arch.arch_vmx.vkr[2] = ia64_get_kr(2);
+	v->arch.arch_vmx.vkr[3] = ia64_get_kr(3);
+	v->arch.arch_vmx.vkr[4] = ia64_get_kr(4);
+	v->arch.arch_vmx.vkr[5] = ia64_get_kr(5);
+	v->arch.arch_vmx.vkr[6] = ia64_get_kr(6);
+	v->arch.arch_vmx.vkr[7] = ia64_get_kr(7);
+}
+
+/* Even guest is in physical mode, we still need such double mapping */
+void
+vmx_load_state(struct vcpu *v)
+{
+	u64 status;
+
+	status = ia64_pal_vp_restore((u64 *)v->arch.privregs, 0);
+	if (status != PAL_STATUS_SUCCESS){
+		panic_domain(vcpu_regs(v),"Restore vp status failed\n");
+	}
+
+	ia64_set_kr(0, v->arch.arch_vmx.vkr[0]);
+	ia64_set_kr(1, v->arch.arch_vmx.vkr[1]);
+	ia64_set_kr(2, v->arch.arch_vmx.vkr[2]);
+	ia64_set_kr(3, v->arch.arch_vmx.vkr[3]);
+	ia64_set_kr(4, v->arch.arch_vmx.vkr[4]);
+	ia64_set_kr(5, v->arch.arch_vmx.vkr[5]);
+	ia64_set_kr(6, v->arch.arch_vmx.vkr[6]);
+	ia64_set_kr(7, v->arch.arch_vmx.vkr[7]);
+	/* Guest vTLB is not required to be switched explicitly, since
+	 * anchored in vcpu */
+}
+
+static void vmx_create_event_channels(struct vcpu *v)
+{
+	vcpu_iodata_t *p;
+	struct vcpu *o;
+
+	if (v->vcpu_id == 0) {
+		/* Ugly: create event channels for every vcpu when vcpu 0
+		   starts, so that they're available for ioemu to bind to. */
+		for_each_vcpu(v->domain, o) {
+			p = get_vio(v->domain, o->vcpu_id);
+			o->arch.arch_vmx.xen_port = p->vp_eport =
+					alloc_unbound_xen_event_channel(o, 0);
+			gdprintk(XENLOG_INFO, "Allocated port %ld for hvm.\n",
+			         o->arch.arch_vmx.xen_port);
+		}
+	}
+}
+
+/*
+ * Event channel has destoryed in domain_kill(), so we needn't
+ * do anything here
+ */
+static void vmx_release_assist_channel(struct vcpu *v)
+{
+	return;
+}
+
+/*
+ * Initialize VMX envirenment for guest. Only the 1st vp/vcpu
+ * is registered here.
+ */
+int
+vmx_final_setup_guest(struct vcpu *v)
+{
+	vpd_t *vpd;
+	int rc;
+	struct switch_stack *sw;
+
+	vpd = alloc_vpd();
+	ASSERT(vpd);
+	if (!vpd)
+		return -ENOMEM;
+
+	v->arch.privregs = (mapped_regs_t *)vpd;
+	vcpu_share_privregs_with_guest(v);
+	vpd->vpd_low.virt_env_vaddr = vm_buffer;
+
+	/* Per-domain vTLB and vhpt implementation. Now vmx domain will stick
+	 * to this solution. Maybe it can be deferred until we know created
+	 * one as vmx domain */
+#ifndef HASH_VHPT
+	rc = init_domain_tlb(v);
+	if (rc)
+		return rc;
+#endif
+	vmx_create_event_channels(v);
+
+	/* v->arch.schedule_tail = arch_vmx_do_launch; */
+	vmx_create_vp(v);
+
+	/* Physical mode emulation initialization, including
+	* emulation ID allcation and related memory request
+	*/
+	physical_mode_init(v);
+
+	vlsapic_reset(v);
+	vtm_init(v);
+
+	/* Set up guest 's indicator for VTi domain*/
+	set_bit(ARCH_VMX_DOMAIN, &v->arch.arch_vmx.flags);
+
+	/* Initialize pNonSys=1 for the first context switching */
+	sw = (struct switch_stack *)vcpu_regs(v) - 1;
+	sw->pr = (1UL << PRED_NON_SYSCALL);
+
+	return 0;
+}
+
+void
+vmx_relinquish_guest_resources(struct domain *d)
+{
+	struct vcpu *v;
+
+	for_each_vcpu(d, v)
+		vmx_release_assist_channel(v);
+}
+
+void
+vmx_relinquish_vcpu_resources(struct vcpu *v)
+{
+	vtime_t *vtm = &(v->arch.arch_vmx.vtm);
+
+	kill_timer(&vtm->vtm_timer);
+
+	free_domain_tlb(v);
+	free_vpd(v);
+}
+
+typedef struct io_range {
+	unsigned long start;
+	unsigned long size;
+	unsigned long type;
+} io_range_t;
+
+static const io_range_t io_ranges[] = {
+	{VGA_IO_START, VGA_IO_SIZE, GPFN_FRAME_BUFFER},
+	{MMIO_START, MMIO_SIZE, GPFN_LOW_MMIO},
+	{LEGACY_IO_START, LEGACY_IO_SIZE, GPFN_LEGACY_IO},
+	{IO_SAPIC_START, IO_SAPIC_SIZE, GPFN_IOSAPIC},
+	{PIB_START, PIB_SIZE, GPFN_PIB},
+};
+
+// The P2M table is built in libxc/ia64/xc_ia64_hvm_build.c @ setup_guest()
+// so only mark IO memory space here
+static void vmx_build_io_physmap_table(struct domain *d)
+{
+	unsigned long i, j;
+
+	/* Mark I/O ranges */
+	for (i = 0; i < (sizeof(io_ranges) / sizeof(io_range_t)); i++) {
+		for (j = io_ranges[i].start;
+		     j < io_ranges[i].start + io_ranges[i].size; j += PAGE_SIZE)
+			(void)__assign_domain_page(d, j, io_ranges[i].type,
+			                           ASSIGN_writable);
+	}
+
+}
+
+void vmx_setup_platform(struct domain *d)
+{
+	ASSERT(d != dom0); /* only for non-privileged vti domain */
+
+	vmx_build_io_physmap_table(d);
+
+	d->arch.vmx_platform.shared_page_va =
+		(unsigned long)__va(__gpa_to_mpa(d, IO_PAGE_START));
+	/* For buffered IO requests. */
+	spin_lock_init(&d->arch.hvm_domain.buffered_io_lock);
+	d->arch.hvm_domain.buffered_io_va =
+		(unsigned long)__va(__gpa_to_mpa(d, BUFFER_IO_PAGE_START));
+	d->arch.hvm_domain.buffered_pio_va =
+		(unsigned long)__va(__gpa_to_mpa(d, BUFFER_PIO_PAGE_START));
+	/* TEMP */
+	d->arch.vmx_platform.pib_base = 0xfee00000UL;
+
+	d->arch.sal_data = xmalloc(struct xen_sal_data);
+
+	/* Only open one port for I/O and interrupt emulation */
+	memset(&d->shared_info->evtchn_mask[0], 0xff,
+	       sizeof(d->shared_info->evtchn_mask));
+
+	/* Initialize iosapic model within hypervisor */
+	viosapic_init(d);
+}
+
+void vmx_do_launch(struct vcpu *v)
+{
+	vmx_load_all_rr(v);
+}
diff -Naurp xen/arch/ia64/vmx/vmx_ivt.S xen-redhat/arch/ia64/vmx/vmx_ivt.S
--- xen/arch/ia64/vmx/vmx_ivt.S
+++ xen-redhat/arch/ia64/vmx/vmx_ivt.S
@@ -208,11 +208,8 @@ vmx_itlb_loop:
     ld8 r18=[r16]
     ;;
     adds r19=VPD(VPSR),r18
-    movl r20=__vsa_base
     ;;
     ld8 r19=[r19]
-    ld8 r20=[r20]
-    ;;
     br.sptk ia64_vmm_entry
     ;;
 vmx_itlb_out:
@@ -289,11 +286,8 @@ vmx_dtlb_loop:
     ld8 r18=[r16]
     ;;
     adds r19=VPD(VPSR),r18
-    movl r20=__vsa_base
     ;;
     ld8 r19=[r19]
-    ld8 r20=[r20]
-    ;;
     br.sptk ia64_vmm_entry
     ;;
 vmx_dtlb_out:
@@ -1011,7 +1005,7 @@ END(vmx_speculation_vector)
 // 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56)
 ENTRY(vmx_debug_vector)
     VMX_DBG_FAULT(29)
-    VMX_FAULT(29)
+    VMX_REFLECT(29)
 END(vmx_debug_vector)
 
     .org vmx_ia64_ivt+0x5a00
diff -Naurp xen/arch/ia64/vmx/vmx_minstate.h xen-redhat/arch/ia64/vmx/vmx_minstate.h
--- xen/arch/ia64/vmx/vmx_minstate.h
+++ xen-redhat/arch/ia64/vmx/vmx_minstate.h
@@ -59,24 +59,16 @@
 
 #define PAL_VSA_SYNC_READ           \
     /* begin to call pal vps sync_read */     \
-    add r25=IA64_VPD_BASE_OFFSET, r21;       \
-    movl r20=__vsa_base;     \
-    ;;          \
-    ld8 r25=[r25];      /* read vpd base */     \
-    ld8 r20=[r20];      /* read entry point */  \
-    ;;      \
-    add r20=PAL_VPS_SYNC_READ,r20;  \
-    ;;  \
 { .mii;  \
+    add r25=IA64_VPD_BASE_OFFSET, r21;       \
     nop 0x0;   \
     mov r24=ip;        \
-    mov b0=r20;     \
     ;;      \
 };           \
 { .mmb;      \
     add r24 = 0x20, r24;    \
-    nop 0x0;   	 \
-    br.cond.sptk b0;        /*  call the service */ \
+    ld8 r25 = [r25];   	 \
+    br.cond.sptk vmx_vps_sync_read;        /*  call the service */ \
     ;;              \
 };           \
 
diff -Naurp xen/arch/ia64/vmx/vmx_phy_mode.c xen-redhat/arch/ia64/vmx/vmx_phy_mode.c
--- xen/arch/ia64/vmx/vmx_phy_mode.c
+++ xen-redhat/arch/ia64/vmx/vmx_phy_mode.c
@@ -237,7 +237,12 @@ void
 switch_mm_mode(VCPU *vcpu, IA64_PSR old_psr, IA64_PSR new_psr)
 {
     int act;
-    act = mm_switch_action(old_psr, new_psr);
+    /* Switch to physical mode when injecting PAL_INIT */
+    if (unlikely(MODE_IND(new_psr) == 0 &&
+                 vcpu_regs(vcpu)->cr_iip == PAL_INIT_ENTRY))
+        act = SW_V2P;
+    else
+        act = mm_switch_action(old_psr, new_psr);
     perfc_incra(vmx_switch_mm_mode, act);
     switch (act) {
     case SW_V2P:
diff -Naurp xen/arch/ia64/vmx/vmx_process.c xen-redhat/arch/ia64/vmx/vmx_process.c
--- xen/arch/ia64/vmx/vmx_process.c
+++ xen-redhat/arch/ia64/vmx/vmx_process.c
@@ -76,7 +76,14 @@ static u64 vec2off[68] = {0x0,0x400,0x80
     0x7f00
 };
 
-
+void vmx_lazy_load_fpu(struct vcpu *vcpu)
+{
+    if (FP_PSR(vcpu) & IA64_PSR_DFH) {
+        FP_PSR(vcpu) = IA64_PSR_MFH;
+        if (__ia64_per_cpu_var(fp_owner) != vcpu)
+            __ia64_load_fpu(vcpu->arch._thread.fph);
+    }
+}
 
 void vmx_reflect_interruption(u64 ifa, u64 isr, u64 iim,
                               u64 vec, REGS *regs)
@@ -86,53 +93,65 @@ void vmx_reflect_interruption(u64 ifa, u
     u64 vpsr = VCPU(vcpu, vpsr);
     
     vector = vec2off[vec];
-    if(!(vpsr&IA64_PSR_IC)&&(vector!=IA64_DATA_NESTED_TLB_VECTOR)){
-        panic_domain(regs, "Guest nested fault vector=%lx!\n", vector);
-    }
 
     switch (vec) {
-
+    case 5:  // IA64_DATA_NESTED_TLB_VECTOR
+        break;
     case 22:	// IA64_INST_ACCESS_RIGHTS_VECTOR
+        if (!(vpsr & IA64_PSR_IC))
+            goto nested_fault;
         if (vhpt_access_rights_fixup(vcpu, ifa, 0))
             return;
         break;
 
     case 25:	// IA64_DISABLED_FPREG_VECTOR
-
-        if (FP_PSR(vcpu) & IA64_PSR_DFH) {
-            FP_PSR(vcpu) = IA64_PSR_MFH;
-            if (__ia64_per_cpu_var(fp_owner) != vcpu)
-                __ia64_load_fpu(vcpu->arch._thread.fph);
-        }
+        if (!(vpsr & IA64_PSR_IC))
+            goto nested_fault;
+        vmx_lazy_load_fpu(vcpu);
         if (!(VCPU(vcpu, vpsr) & IA64_PSR_DFH)) {
             regs->cr_ipsr &= ~IA64_PSR_DFH;
             return;
         }
 
         break;       
-        
+
     case 32:	// IA64_FP_FAULT_VECTOR
+        if (!(vpsr & IA64_PSR_IC))
+            goto nested_fault;
         // handle fpswa emulation
         // fp fault
         status = handle_fpu_swa(1, regs, isr);
         if (!status) {
             vcpu_increment_iip(vcpu);
             return;
-        } else if (IA64_RETRY == status)
-            return;
+        }
         break;
 
     case 33:	// IA64_FP_TRAP_VECTOR
+        if (!(vpsr & IA64_PSR_IC))
+            goto nested_fault;
         //fp trap
         status = handle_fpu_swa(0, regs, isr);
         if (!status)
             return;
-        else if (IA64_RETRY == status) {
-            vcpu_decrement_iip(vcpu);
+        break;
+
+    case 29: // IA64_DEBUG_VECTOR
+    case 35: // IA64_TAKEN_BRANCH_TRAP_VECTOR
+    case 36: // IA64_SINGLE_STEP_TRAP_VECTOR
+        if (vmx_guest_kernel_mode(regs)
+            && current->domain->debugger_attached) {
+            domain_pause_for_debugger();
             return;
         }
+        if (!(vpsr & IA64_PSR_IC))
+            goto nested_fault;
+        break;
+
+    default:
+        if (!(vpsr & IA64_PSR_IC))
+            goto nested_fault;
         break;
-    
     } 
     VCPU(vcpu,isr)=isr;
     VCPU(vcpu,iipa) = regs->cr_iip;
@@ -142,6 +161,10 @@ void vmx_reflect_interruption(u64 ifa, u
         set_ifa_itir_iha(vcpu,ifa,1,1,1);
     }
     inject_guest_interruption(vcpu, vector);
+    return;
+
+ nested_fault:
+    panic_domain(regs, "Guest nested fault vector=%lx!\n", vector);
 }
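
vmx_lazy_load_fpu() above (and ia64_lazy_load_fpu() later in faults.c) applies the usual lazy-FPU idea: leave the high floating-point partition unloaded until the guest touches it, and skip the reload when this vcpu already owns the physical registers.  A toy model of that ownership check is sketched below; the vcpu structure, fp_owner variable and array sizes are invented for illustration.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Toy stand-ins for the real vcpu and per-CPU FP state (assumptions). */
struct vcpu {
    int id;
    bool fp_disabled;        /* models FP_PSR(v) & IA64_PSR_DFH */
    double fph[4];           /* models the saved f32..f127 partition */
};

static struct vcpu *fp_owner;    /* models __ia64_per_cpu_var(fp_owner) */
static double live_fph[4];       /* models the physical high FP registers */

static void lazy_load_fpu(struct vcpu *v)
{
    if (v->fp_disabled) {
        v->fp_disabled = false;          /* DFH -> MFH in the real code */
        if (fp_owner != v) {             /* reload only if someone else owns the FPU */
            memcpy(live_fph, v->fph, sizeof(live_fph));
            fp_owner = v;                /* ownership tracking is simplified here */
        }
    }
}

int main(void)
{
    struct vcpu a = { .id = 0, .fp_disabled = true, .fph = { 1, 2, 3, 4 } };

    lazy_load_fpu(&a);    /* first touch: loads a's state */
    lazy_load_fpu(&a);    /* already owner and DFH clear: no work */
    printf("owner=%d live_fph[0]=%g\n", fp_owner->id, live_fph[0]);
    return 0;
}
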
 
 
diff -Naurp xen/arch/ia64/vmx/vmx_utility.c xen-redhat/arch/ia64/vmx/vmx_utility.c
--- xen/arch/ia64/vmx/vmx_utility.c
+++ xen-redhat/arch/ia64/vmx/vmx_utility.c
@@ -26,7 +26,7 @@
 #include <asm/processor.h>
 #include <asm/vmx_mm_def.h>
 
-
+#ifdef CHECK_FAULT
 /*
  * Return:
  *  0:  Not reserved indirect registers
@@ -71,6 +71,7 @@ is_reserved_indirect_register (
     return 0;
 
 }
+#endif
 
 /*
  * Return:
@@ -207,7 +208,7 @@ check_psr_rsv_fields (u64 value)
 }
 
 
-
+#ifdef CHECK_FAULT
 /*
  * Return:
  *  1: CR reserved fields are not zero
@@ -310,9 +311,9 @@ check_cr_rsv_fields (int index, u64 valu
     panic ("Unsupported CR");
     return 0;
 }
+#endif
 
-
-
+#if 0
 /*
  * Return:
  *  0:  Indirect Reg reserved fields are not zero
@@ -361,7 +362,7 @@ check_indirect_reg_rsv_fields ( int type
 
     return 1;
 }
-
+#endif
 
 
 
diff -Naurp xen/arch/ia64/vmx/vmx_vcpu.c xen-redhat/arch/ia64/vmx/vmx_vcpu.c
--- xen/arch/ia64/vmx/vmx_vcpu.c
+++ xen-redhat/arch/ia64/vmx/vmx_vcpu.c
@@ -96,8 +96,7 @@ vmx_vcpu_set_psr(VCPU *vcpu, unsigned lo
      */
     VCPU(vcpu,vpsr) = value &
             (~ (IA64_PSR_ID |IA64_PSR_DA | IA64_PSR_DD |
-                IA64_PSR_SS | IA64_PSR_ED | IA64_PSR_IA
-            ));
+                IA64_PSR_ED | IA64_PSR_IA));
 
     if ( !old_psr.i && (value & IA64_PSR_I) ) {
         // vpsr.i 0->1
diff -Naurp xen/arch/ia64/vmx/vmx_virt.c xen-redhat/arch/ia64/vmx/vmx_virt.c
--- xen/arch/ia64/vmx/vmx_virt.c
+++ xen-redhat/arch/ia64/vmx/vmx_virt.c
@@ -178,8 +178,8 @@ static IA64FAULT vmx_emul_mov_to_psr(VCP
 {
     u64 val;
 
-    if(vcpu_get_gr_nat(vcpu, inst.M35.r2, &val) != IA64_NO_FAULT)
-	panic_domain(vcpu_regs(vcpu),"get_psr nat bit fault\n");
+    if (vcpu_get_gr_nat(vcpu, inst.M35.r2, &val) != IA64_NO_FAULT)
+        panic_domain(vcpu_regs(vcpu),"get_psr nat bit fault\n");
 
     return vmx_vcpu_set_psr_l(vcpu, val);
 }
@@ -892,7 +892,6 @@ static IA64FAULT vmx_emul_mov_to_rr(VCPU
 static IA64FAULT vmx_emul_mov_to_dbr(VCPU *vcpu, INST64 inst)
 {
     u64 r3,r2;
-    return IA64_NO_FAULT;
 #ifdef  CHECK_FAULT
     IA64_PSR vpsr;
     vpsr.val=vmx_vcpu_get_psr(vcpu);
@@ -916,7 +915,6 @@ static IA64FAULT vmx_emul_mov_to_dbr(VCP
 static IA64FAULT vmx_emul_mov_to_ibr(VCPU *vcpu, INST64 inst)
 {
     u64 r3,r2;
-    return IA64_NO_FAULT;
 #ifdef  CHECK_FAULT
     IA64_PSR vpsr;
     vpsr.val=vmx_vcpu_get_psr(vcpu);
@@ -934,7 +932,7 @@ static IA64FAULT vmx_emul_mov_to_ibr(VCP
         return IA64_FAULT;
 #endif  //CHECK_FAULT
     }
-    return (vmx_vcpu_set_ibr(vcpu,r3,r2));
+    return vmx_vcpu_set_ibr(vcpu,r3,r2);
 }
 
 static IA64FAULT vmx_emul_mov_to_pmc(VCPU *vcpu, INST64 inst)
@@ -1064,6 +1062,7 @@ static IA64FAULT vmx_emul_mov_from_pkr(V
 static IA64FAULT vmx_emul_mov_from_dbr(VCPU *vcpu, INST64 inst)
 {
     u64 r3,r1;
+    IA64FAULT res;
 #ifdef  CHECK_FAULT
     if(check_target_register(vcpu, inst.M43.r1)){
         set_illegal_op_isr(vcpu);
@@ -1094,13 +1093,16 @@ static IA64FAULT vmx_emul_mov_from_dbr(V
         return IA64_FAULT;
     }
 #endif  //CHECK_FAULT
-    vmx_vcpu_get_dbr(vcpu,r3,&r1);
+    res = vmx_vcpu_get_dbr(vcpu, r3, &r1);
+    if (res != IA64_NO_FAULT)
+        return res;
     return vcpu_set_gr(vcpu, inst.M43.r1, r1,0);
 }
 
 static IA64FAULT vmx_emul_mov_from_ibr(VCPU *vcpu, INST64 inst)
 {
     u64 r3,r1;
+    IA64FAULT res;
 #ifdef  CHECK_FAULT
     if(check_target_register(vcpu, inst.M43.r1)){
         set_illegal_op_isr(vcpu);
@@ -1131,7 +1133,9 @@ static IA64FAULT vmx_emul_mov_from_ibr(V
         return IA64_FAULT;
     }
 #endif  //CHECK_FAULT
-    vmx_vcpu_get_ibr(vcpu,r3,&r1);
+    res = vmx_vcpu_get_ibr(vcpu, r3, &r1);
+    if (res != IA64_NO_FAULT)
+        return res;
     return vcpu_set_gr(vcpu, inst.M43.r1, r1,0);
 }
 
@@ -1558,22 +1562,38 @@ if ( (cause == 0xff && opcode == 0x1e000
         break;
     case EVENT_VMSW:
         printk ("Unimplemented instruction %ld\n", cause);
-	status=IA64_FAULT;
+        status=IA64_FAULT;
         break;
     default:
-        panic_domain(regs,"unknown cause %ld, iip: %lx, ipsr: %lx\n", cause,regs->cr_iip,regs->cr_ipsr);
+        panic_domain(regs,"unknown cause %ld, iip: %lx, ipsr: %lx\n",
+                     cause,regs->cr_iip,regs->cr_ipsr);
         break;
     };
 
 #if 0
-    if (status == IA64_FAULT)
+    if (status != IA64_NO_FAULT)
 	panic("Emulation failed with cause %d:\n", cause);
 #endif
 
-    if ( status == IA64_NO_FAULT && cause !=EVENT_RFI ) {
-        vcpu_increment_iip(vcpu);
+    switch (status) {
+    case IA64_RSVDREG_FAULT:
+        set_rsv_reg_field_isr(vcpu);
+        rsv_reg_field(vcpu);
+        break;
+    case IA64_ILLOP_FAULT:
+        set_illegal_op_isr(vcpu);
+        illegal_op(vcpu);
+        break;
+    case IA64_FAULT:
+        /* Registers already set.  */
+        break;
+    case IA64_NO_FAULT:
+        if ( cause != EVENT_RFI )
+            vcpu_increment_iip(vcpu);
+        break;
     }
 
+
     recover_if_physical_mode(vcpu);
     return;
 
diff -Naurp xen/arch/ia64/vmx/vtlb.c xen-redhat/arch/ia64/vmx/vtlb.c
--- xen/arch/ia64/vmx/vtlb.c
+++ xen-redhat/arch/ia64/vmx/vtlb.c
@@ -21,34 +21,14 @@
  *  XiaoYan Feng (Fleming Feng) (Fleming.feng@intel.com)
  */
 
-#include <linux/sched.h>
-#include <asm/tlb.h>
-#include <xen/mm.h>
-#include <asm/vmx_mm_def.h>
-#include <asm/gcc_intrin.h>
-#include <linux/interrupt.h>
 #include <asm/vmx_vcpu.h>
-#include <asm/vmx_phy_mode.h>
-#include <asm/vmmu.h>
-#include <asm/tlbflush.h>
-#include <asm/regionreg.h>
-#define  MAX_CCH_LENGTH     40
 
 thash_data_t *__alloc_chain(thash_cb_t *);
 
-static void cch_mem_init(thash_cb_t *hcb)
+static inline void cch_mem_init(thash_cb_t *hcb)
 {
-    int num;
-    thash_data_t *p;
-
-    hcb->cch_freelist = p = hcb->cch_buf;
-    num = (hcb->cch_sz/sizeof(thash_data_t))-1;
-    do{
-        p->next =p+1;
-        p++;
-        num--;
-    }while(num);
-    p->next = NULL;
+    hcb->cch_free_idx = 0;
+    hcb->cch_freelist = NULL;
 }
 
 static thash_data_t *cch_alloc(thash_cb_t *hcb)
@@ -56,8 +36,16 @@ static thash_data_t *cch_alloc(thash_cb_
     thash_data_t *p;
     if ( (p = hcb->cch_freelist) != NULL ) {
         hcb->cch_freelist = p->next;
+        return p;
     }
-    return p;
+    if (hcb->cch_free_idx < hcb->cch_sz/sizeof(thash_data_t)) {
+        p = &((thash_data_t *)hcb->cch_buf)[hcb->cch_free_idx++];
+        p->page_flags = 0;
+        p->itir = 0;
+        p->next = NULL;
+        return p;
+    }
+    return NULL;
 }
 
 /*
@@ -298,6 +286,17 @@ u64 guest_vhpt_lookup(u64 iha, u64 *pte)
     return ret;
 }
 
+static thash_data_t * vtlb_thash(PTA vpta, u64 va, u64 vrr, u64 *tag)
+{
+    u64 index, pfn, rid;
+
+    pfn = REGION_OFFSET(va) >> _REGION_PAGE_SIZE(vrr);
+    rid = _REGION_ID(vrr);
+    index = (pfn ^ rid) & ((1UL << (vpta.size - 5)) - 1);
+    *tag = pfn ^ (rid << 39);
+    return (thash_data_t *)((vpta.base << PTA_BASE_SHIFT) + (index << 5));
+}
+
 /*
  *  purge software guest tlb
  */
@@ -320,7 +319,7 @@ static void vtlb_purge(VCPU *v, u64 va, 
         size = PSIZE(rr_ps);
         vrr.ps = rr_ps;
         while (num) {
-            cur = vsa_thash(hcb->pta, curadr, vrr.rrval, &tag);
+            cur = vtlb_thash(hcb->pta, curadr, vrr.rrval, &tag);
             while (cur) {
                 if (cur->etag == tag && cur->ps == rr_ps)
                     cur->etag = 1UL << 63;
@@ -413,7 +412,7 @@ void vtlb_insert(VCPU *v, u64 pte, u64 i
     vcpu_get_rr(v, va, &vrr.rrval);
     vrr.ps = itir_ps(itir);
     VMX(v, psbits[va >> 61]) |= (1UL << vrr.ps);
-    hash_table = vsa_thash(hcb->pta, va, vrr.rrval, &tag);
+    hash_table = vtlb_thash(hcb->pta, va, vrr.rrval, &tag);
     cch = hash_table;
     while (cch) {
         if (INVALID_TLB(cch)) {
@@ -556,13 +555,15 @@ void thash_purge_and_insert(VCPU *v, u64
         }
         else {
             u64 psr;
-            phy_pte  &= ~PAGE_FLAGS_RV_MASK;
-            psr = ia64_clear_ic();
-            ia64_itc(type + 1, ifa, phy_pte, ps);
-            ia64_set_psr(psr);
-            ia64_srlz_i();
-            // ps < mrr.ps, this is not supported
-            // panic_domain(NULL, "%s: ps (%lx) < mrr.ps \n", __func__, ps);
+            vtlb_insert(v, pte, itir, ifa);
+            vcpu_quick_region_set(PSCBX(v,tc_regions),ifa);
+            if (!(pte & VTLB_PTE_IO)) {
+		phy_pte  &= ~PAGE_FLAGS_RV_MASK;
+		psr = ia64_clear_ic();
+		ia64_itc(type + 1, ifa, phy_pte, ps);
+		ia64_set_psr(psr);
+		ia64_srlz_i();
+	    }
         }
     }
     else{
@@ -618,6 +619,30 @@ void thash_purge_all(VCPU *v)
     local_flush_tlb_all();
 }
 
+static void __thash_purge_all(void *arg)
+{
+    struct vcpu *v = arg;
+
+    BUG_ON(vcpu_runnable(v) || v->is_running);
+    thash_purge_all(v);
+}
+
+void vmx_vcpu_flush_vtlb_all(VCPU *v)
+{
+    if (v == current) {
+        thash_purge_all(v);
+        return;
+    }
+
+    /* SMP safe */
+    vcpu_pause(v);
+    if (v->processor == smp_processor_id())
+        __thash_purge_all(v);
+    else
+        smp_call_function_single(v->processor, __thash_purge_all, v, 1, 1);
+    vcpu_unpause(v);
+}
+
 
 /*
  * Lookup the hash table and its collision chain to find an entry
@@ -645,30 +670,38 @@ thash_data_t *vtlb_lookup(VCPU *v, u64 v
         ps = __ffs(psbits);
         psbits &= ~(1UL << ps);
         vrr.ps = ps;
-        cch = vsa_thash(hcb->pta, va, vrr.rrval, &tag);
+        cch = vtlb_thash(hcb->pta, va, vrr.rrval, &tag);
         do {
             if (cch->etag == tag && cch->ps == ps)
-                return cch;
+                goto found;
             cch = cch->next;
         } while(cch);
     }
     return NULL;
+found:
+    if (unlikely(!cch->ed && is_data == ISIDE_TLB)) {
+        /* The case is very rare, and it may lead to an incorrect setting
+           of the itlb's ed bit!  Purge it from the hash vTLB and let the
+           guest OS determine the ed bit of the itlb entry. */
+        vtlb_purge(v, va, ps);
+        cch = NULL;
+    }
+    return cch;
 }
 
 
 /*
  * Initialize internal control data before service.
  */
-void thash_init(thash_cb_t *hcb, u64 sz)
+static void thash_init(thash_cb_t *hcb, u64 sz)
 {
     int num;
-    thash_data_t *head, *p;
+    thash_data_t *head;
 
     hcb->pta.val = (unsigned long)hcb->hash;
     hcb->pta.vf = 1;
     hcb->pta.ve = 1;
     hcb->pta.size = sz;
-    hcb->cch_rec_head = hcb->hash;
     
     head=hcb->hash;
     num = (hcb->hash_sz/sizeof(thash_data_t));
@@ -680,16 +713,47 @@ void thash_init(thash_cb_t *hcb, u64 sz)
         head++;
         num--;
     }while(num);
+
+    hcb->cch_free_idx = 0;
+    hcb->cch_freelist = NULL;
+}
+
+int thash_alloc(thash_cb_t *hcb, u64 sz_log2, char *what)
+{
+    struct page_info *page;
+    void * vbase;
+    u64 sz = 1UL << sz_log2;
+
+    page = alloc_domheap_pages(NULL, (sz_log2 + 1 - PAGE_SHIFT), 0);
+    if (page == NULL) {
+        printk("Not enough contiguous memory (%ldKB) for init_domain_%s\n",
+               sz >> (10 - 1), what);
+        return -ENOMEM;
+    }
+    vbase = page_to_virt(page);
+    memset(vbase, 0, sz + sz); // hash + collisions chain
+    if (sz_log2 >= 20 - 1)
+        printk(XENLOG_DEBUG "Allocate domain %s at 0x%p(%ldMB)\n", 
+               what, vbase, sz >> (20 - 1));
+    else
+        printk(XENLOG_DEBUG "Allocate domain %s at 0x%p(%ldKB)\n",
+               what, vbase, sz >> (10 - 1));
     
-    hcb->cch_freelist = p = hcb->cch_buf;
-    num = hcb->cch_sz / sizeof(thash_data_t);
-    do{
-        p->page_flags = 0;
-        p->itir = 0;
-        p->next =p+1;
-        p++;
-        num--;
-    }while(num);
+    hcb->hash = vbase;
+    hcb->hash_sz = sz;
+    hcb->cch_buf = (void *)((u64)vbase + hcb->hash_sz);
+    hcb->cch_sz = sz;
+    thash_init(hcb, sz_log2);
+    return 0;
+}
 
-    (p - 1)->next = NULL;
+void thash_free(thash_cb_t *hcb)
+{
+    struct page_info *page;
+
+    if (hcb->hash) {
+        page = virt_to_page(hcb->hash);
+        free_domheap_pages(page, hcb->pta.size + 1 - PAGE_SHIFT);
+        hcb->hash = 0;
+    }
 }
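
vtlb_thash() above replaces the PAL VSA thash service with an open-coded long-format VHPT hash: the region ID is folded into the virtual page number, the result is masked to the table size (entries are 32 bytes, hence the "- 5" and "<< 5"), and a tag is derived for the collision-chain compares.  The standalone sketch below reproduces that arithmetic with made-up PTA size, page size and inputs.

#include <stdio.h>

/* Illustrative parameters, not taken from a real PTA/RR register. */
#define PTA_SIZE   18UL              /* hash table spans 1 << 18 bytes */
#define PAGE_SHIFT 14UL              /* assumed 16KB pages */

int main(void)
{
    unsigned long va  = 0x2000000001234000UL;   /* sample guest VA */
    unsigned long rid = 0x1234UL;               /* sample region ID */

    unsigned long region_offset = va & ((1UL << 61) - 1);  /* drop the 3 region bits */
    unsigned long pfn   = region_offset >> PAGE_SHIFT;
    unsigned long index = (pfn ^ rid) & ((1UL << (PTA_SIZE - 5)) - 1);
    unsigned long tag   = pfn ^ (rid << 39);

    /* index selects one 32-byte entry; the real code adds (index << 5)
     * to the table base taken from the PTA register. */
    printf("pfn=%#lx index=%#lx tag=%#lx byte offset=%#lx\n",
           pfn, index, tag, index << 5);
    return 0;
}
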
diff -Naurp xen/arch/ia64/xen/dom0_ops.c xen-redhat/arch/ia64/xen/dom0_ops.c
--- xen/arch/ia64/xen/dom0_ops.c
+++ xen-redhat/arch/ia64/xen/dom0_ops.c
@@ -214,6 +214,39 @@ long arch_do_domctl(xen_domctl_t *op, XE
     }
     break;
 
+    case XEN_DOMCTL_set_address_size:
+    {
+        struct domain *d = rcu_lock_domain_by_id(op->domain);
+
+        ret = -ESRCH;
+        if (d == NULL)
+            break;
+
+        ret = -EINVAL;
+        if (op->u.address_size.size == BITS_PER_LONG)
+            ret = 0;
+
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_get_address_size:
+    {
+        struct domain *d = rcu_lock_domain_by_id(op->domain);
+
+        ret = -ESRCH;
+        if (d  == NULL)
+            break;
+
+        ret = 0;
+        op->u.address_size.size = BITS_PER_LONG;
+        rcu_unlock_domain(d);
+
+        if (copy_to_guest(u_domctl, op, 1))
+            ret = -EFAULT;
+    }
+    break;
+
     default:
         printk("arch_do_domctl: unrecognized domctl: %d!!!\n",op->cmd);
         ret = -ENOSYS;
@@ -223,12 +256,6 @@ long arch_do_domctl(xen_domctl_t *op, XE
     return ret;
 }
 
-/*
- * Temporarily disable the NUMA PHYSINFO code until the rest of the
- * changes are upstream.
- */
-#undef IA64_NUMA_PHYSINFO
-
 long arch_do_sysctl(xen_sysctl_t *op, XEN_GUEST_HANDLE(xen_sysctl_t) u_sysctl)
 {
     long ret = 0;
@@ -237,84 +264,68 @@ long arch_do_sysctl(xen_sysctl_t *op, XE
     {
     case XEN_SYSCTL_physinfo:
     {
-#ifdef IA64_NUMA_PHYSINFO
-        int i;
-        node_data_t *chunks;
-        u64 *map, cpu_to_node_map[MAX_NUMNODES];
-#endif
+        int i, node_cpus = 0;
+        uint32_t max_array_ent;
+        XEN_GUEST_HANDLE_64(uint32_t) cpu_to_node_arr;
 
         xen_sysctl_physinfo_t *pi = &op->u.physinfo;
 
-        pi->threads_per_core =
-            cpus_weight(cpu_sibling_map[0]);
+        max_array_ent = pi->max_cpu_id;
+        cpu_to_node_arr = pi->cpu_to_node;
+
+        pi->cpu_to_node = cpu_to_node_arr;
+        pi->threads_per_core = cpus_weight(cpu_sibling_map[0]);
         pi->cores_per_socket =
             cpus_weight(cpu_core_map[0]) / pi->threads_per_core;
-        pi->sockets_per_node = 
-            num_online_cpus() / cpus_weight(cpu_core_map[0]);
-#ifndef IA64_NUMA_PHYSINFO
-        pi->nr_nodes         = 1; 
-#endif
+        pi->nr_nodes         = num_online_nodes();
+        /*
+         * RHEL5 ABI compat:
+         * Newer userspace expects 'sockets_per_node' to actually
+         * contain 'nr_cpus' data.
+         */
+        if (op->interface_version > XEN_SYSCTL_INTERFACE_VERSION)
+            pi->sockets_per_node = (u32)num_online_cpus();
+        else
+            {
+                /*
+                 * Guess at a sockets_per_node value.  Use the maximum number of
+                 * CPUs per node to avoid deconfigured CPUs breaking the average.
+                 */
+                for_each_online_node(i)
+                    node_cpus = max(node_cpus, cpus_weight(node_to_cpumask(i)));
+                
+                pi->sockets_per_node = node_cpus / 
+                    (pi->cores_per_socket * pi->threads_per_core);
+            }
+
         pi->total_pages      = total_pages; 
         pi->free_pages       = avail_domheap_pages();
-        pi->scrub_pages      = avail_scrub_pages();
+        pi->scrub_pages      = 0;
         pi->cpu_khz          = local_cpu_data->proc_freq / 1000;
         memset(pi->hw_cap, 0, sizeof(pi->hw_cap));
-        //memcpy(pi->hw_cap, boot_cpu_data.x86_capability, NCAPINTS*4);
+
+        pi->max_cpu_id = last_cpu(cpu_online_map);
+        max_array_ent = min_t(uint32_t, max_array_ent, pi->max_cpu_id);
+
         ret = 0;
 
-#ifdef IA64_NUMA_PHYSINFO
-        /* fetch memory_chunk pointer from guest */
-        get_xen_guest_handle(chunks, pi->memory_chunks);
-
-        printk("chunks=%p, num_node_memblks=%u\n", chunks, num_node_memblks);
-        /* if it is set, fill out memory chunk array */
-        if (chunks != NULL) {
-            if (num_node_memblks == 0) {
-                /* Non-NUMA machine.  Put pseudo-values.  */
-                node_data_t data;
-                data.node_start_pfn = 0;
-                data.node_spanned_pages = total_pages;
-                data.node_id = 0;
-                /* copy memory chunk structs to guest */
-                if (copy_to_guest_offset(pi->memory_chunks, 0, &data, 1)) {
-                    ret = -EFAULT;
-                    break;
-                }
-            } else {
-                for (i = 0; i < num_node_memblks && i < PUBLIC_MAXCHUNKS; i++) {
-                    node_data_t data;
-                    data.node_start_pfn = node_memblk[i].start_paddr >>
-                                          PAGE_SHIFT;
-                    data.node_spanned_pages = node_memblk[i].size >> PAGE_SHIFT;
-                    data.node_id = node_memblk[i].nid;
-                    /* copy memory chunk structs to guest */
-                    if (copy_to_guest_offset(pi->memory_chunks, i, &data, 1)) {
+        /*
+         * RHEL5 ABI compat:
+         * Only fill in extended NUMA info if a newer userspace
+         * is talking to us
+         */
+        if (op->interface_version > XEN_SYSCTL_INTERFACE_VERSION)
+        {
+            if (!guest_handle_is_null(cpu_to_node_arr)) {
+                for (i = 0; i <= max_array_ent; i++) {
+                    uint32_t node = cpu_online(i) ? cpu_to_node(i) : ~0u;
+                    if (copy_to_guest_offset(cpu_to_node_arr, i, &node, 1)) {
                         ret = -EFAULT;
                         break;
                     }
                 }
             }
         }
-        /* set number of notes */
-        pi->nr_nodes = num_online_nodes();
-
-        /* fetch cpu_to_node pointer from guest */
-        get_xen_guest_handle(map, pi->cpu_to_node);
-
-        /* if set, fill out cpu_to_node array */
-        if (map != NULL) {
-            /* copy cpu to node mapping to domU */
-            memset(cpu_to_node_map, 0, sizeof(cpu_to_node_map));
-            for (i = 0; i < num_online_cpus(); i++) {
-                cpu_to_node_map[i] = cpu_to_node(i);
-                if (copy_to_guest_offset(pi->cpu_to_node, i,
-                                         &(cpu_to_node_map[i]), 1)) {
-                    ret = -EFAULT;
-                    break;
-                }
-            }
-        }
-#endif
 
         if ( copy_to_guest(u_sysctl, op, 1) )
             ret = -EFAULT;
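
The reworked XEN_SYSCTL_physinfo handler copies at most min(the caller's max_cpu_id, Xen's last online CPU) + 1 entries of the cpu-to-node map into the caller-supplied array, storing ~0 for offline CPUs.  A userspace-flavoured sketch of that bounded copy, with plain arrays instead of guest handles and an invented topology, looks like:

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 8

int main(void)
{
    /* Pretend topology: CPUs 0-5 online on two nodes, 6-7 offline (made up). */
    int      cpu_online[NR_CPUS]  = { 1, 1, 1, 1, 1, 1, 0, 0 };
    uint32_t cpu_to_node[NR_CPUS] = { 0, 0, 0, 1, 1, 1, 0, 0 };

    uint32_t caller_max = 3;          /* caller sized its array for CPUs 0..3 */
    uint32_t xen_max    = 5;          /* last online CPU Xen knows about */
    uint32_t max_ent    = caller_max < xen_max ? caller_max : xen_max;
    uint32_t out[NR_CPUS];

    for (uint32_t i = 0; i <= max_ent; i++)
        out[i] = cpu_online[i] ? cpu_to_node[i] : ~0u;   /* ~0 marks offline */

    for (uint32_t i = 0; i <= max_ent; i++)
        printf("cpu %u -> node %u\n", i, out[i]);
    return 0;
}
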
diff -Naurp xen/arch/ia64/xen/domain.c xen-redhat/arch/ia64/xen/domain.c
--- xen/arch/ia64/xen/domain.c
+++ xen-redhat/arch/ia64/xen/domain.c
@@ -52,10 +52,11 @@
 #include <asm/perfmon.h>
 #include <public/vcpu.h>
 
-unsigned long dom0_size = 512*1024*1024;
+/* dom0_size: default memory allocation for dom0 (~4GB) */
+unsigned long dom0_size = 4096UL*1024UL*1024UL;
 
 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0.  */
-static unsigned int dom0_max_vcpus = 1;
+static unsigned int dom0_max_vcpus = 4;
 integer_param("dom0_max_vcpus", dom0_max_vcpus); 
 
 extern unsigned long running_on_sim;
@@ -237,6 +238,14 @@ void context_switch(struct vcpu *prev, s
     ia64_disable_vhpt_walker();
     lazy_fp_switch(prev, current);
 
+    if (prev->arch.dbg_used || next->arch.dbg_used) {
+        /*
+         * Load debug registers either because they are valid or to clear
+         * the previous ones.
+         */
+        ia64_load_debug_regs(next->arch.dbr);
+    }
+    
     prev = ia64_switch_to(next);
 
     /* Note: ia64_switch_to does not return here at vcpu initialization.  */
@@ -336,7 +345,6 @@ static void continue_cpu_idle_loop(void)
 #else
 	    irq_stat[cpu].idle_timestamp = jiffies;
 #endif
-	    page_scrub_schedule_work();
 	    while ( !softirq_pending(smp_processor_id()) )
 	        default_idle();
 	    raise_softirq(SCHEDULE_SOFTIRQ);
@@ -553,6 +561,9 @@ int arch_domain_create(struct domain *d)
 		goto fail_nomem;
 
 	memset(&d->arch.mm, 0, sizeof(d->arch.mm));
+ 	d->arch.relres = RELRES_not_started;
+        d->arch.mm_teardown_offset = 0;
+ 	INIT_LIST_HEAD(&d->arch.relmem_list);
 
 	if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
 	    goto fail_nomem;
@@ -695,13 +706,14 @@ int arch_set_info_guest(struct vcpu *v, 
 	return 0;
 }
 
-static void relinquish_memory(struct domain *d, struct list_head *list)
+static int relinquish_memory(struct domain *d, struct list_head *list)
 {
     struct list_head *ent;
     struct page_info *page;
 #ifndef __ia64__
     unsigned long     x, y;
 #endif
+    int               ret = 0;
 
     /* Use a recursive lock, as we may enter 'free_domheap_page'. */
     spin_lock_recursive(&d->page_alloc_lock);
@@ -714,6 +726,7 @@ static void relinquish_memory(struct dom
         {
             /* Couldn't get a reference -- someone is freeing this page. */
             ent = ent->next;
+            list_move_tail(&page->list, &d->arch.relmem_list);
             continue;
         }
 
@@ -750,30 +763,72 @@ static void relinquish_memory(struct dom
         /* Follow the list chain and /then/ potentially free the page. */
         ent = ent->next;
         BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
+        list_move_tail(&page->list, &d->arch.relmem_list);
         put_page(page);
+
+        if (hypercall_preempt_check()) {
+                ret = -EAGAIN;
+                goto out;
+        }
     }
 
+    list_splice_init(&d->arch.relmem_list, list);
+
+ out:
     spin_unlock_recursive(&d->page_alloc_lock);
+    return ret;
 }
 
-void domain_relinquish_resources(struct domain *d)
+int domain_relinquish_resources(struct domain *d)
 {
-    /* Relinquish guest resources for VT-i domain. */
-    if (d->vcpu[0] && VMX_DOMAIN(d->vcpu[0]))
-	    vmx_relinquish_guest_resources(d);
+	int ret = 0;
 
-    /* Tear down shadow mode stuff. */
-    mm_teardown(d);
+	switch (d->arch.relres) {
+	case RELRES_not_started:
+		/* Relinquish guest resources for VT-i domain. */
+		if (d->arch.is_vti)
+			vmx_relinquish_guest_resources(d);
+		d->arch.relres = RELRES_mm_teardown;
+		/*fallthrough*/
+
+	case RELRES_mm_teardown:
+		/* Tear down shadow mode stuff. */
+		ret = mm_teardown(d);
+		if (ret != 0)
+			return ret;
+		d->arch.relres = RELRES_xen;
+		/* fallthrough */
+
+	case RELRES_xen:
+		/* Relinquish every xen page of memory. */
+		ret = relinquish_memory(d, &d->xenpage_list);
+		if (ret != 0)
+			return ret;
+		d->arch.relres = RELRES_dom;
+		/* fallthrough */
+
+	case RELRES_dom:
+		/* Relinquish every domain page of memory. */
+		ret = relinquish_memory(d, &d->page_list);
+		if (ret != 0)
+			return ret;
+		d->arch.relres = RELRES_done;
+		/* fallthrough */    
 
-    /* Relinquish every page of memory. */
-    relinquish_memory(d, &d->xenpage_list);
-    relinquish_memory(d, &d->page_list);
+	case RELRES_done:
+		break;
+
+	default:
+		BUG();
+	}
 
-    if (d->arch.is_vti && d->arch.sal_data)
-	    xfree(d->arch.sal_data);
+	if (d->arch.is_vti && d->arch.sal_data)
+		xfree(d->arch.sal_data);
 
-    /* Free page used by xen oprofile buffer */
-    free_xenoprof_pages(d);
+	/* Free page used by xen oprofile buffer */
+	free_xenoprof_pages(d);
+
+	return 0;
 }
 
 unsigned long
@@ -1015,8 +1070,41 @@ static void loaddomainelfimage(struct do
 	}
 }
 
-void alloc_dom0(void)
+static void calc_dom0_size(void)
 {
+	unsigned long domheap_pages;
+	unsigned long p2m_pages;
+	unsigned long spare_hv_pages;
+	unsigned long max_dom0_size;
+
+	/* Estimate maximum memory we can safely allocate for dom0
+	 * by subtracting the p2m table allocation and a chunk of memory
+	 * for DMA and PCI mapping from the available domheap pages. The
+	 * chunk for DMA, PCI, etc., is a guestimate, as xen doesn't seem
+	 * to have a good idea of what those requirements might be ahead
+	 * of time, calculated at 128MB + 1MB per 4GB of system memory */
+	domheap_pages = avail_domheap_pages();
+	p2m_pages = domheap_pages / PTRS_PER_PTE;
+	spare_hv_pages = 8192 + (domheap_pages / 4096);
+	max_dom0_size = (domheap_pages - (p2m_pages + spare_hv_pages))
+			 * PAGE_SIZE;
+	printk("Maximum permitted dom0 size: %luMB\n",
+	       max_dom0_size / (1024*1024));
+
+	/* validate proposed dom0_size, fix up as needed */
+	if (dom0_size > max_dom0_size) {
+		printk("Reducing dom0 memory allocation from %luK to %luK "
+		       "to fit available memory\n",
+		       dom0_size / 1024, max_dom0_size / 1024);
+		dom0_size = max_dom0_size;
+	}
+
+	/* dom0_mem=0 can be passed in to give all available mem to dom0 */
+	if (dom0_size == 0) {
+		printk("Allocating all available memory to dom0\n");
+		dom0_size = max_dom0_size;
+	}
+
 	/* Check dom0 size.  */
 	if (dom0_size < 4 * 1024 * 1024) {
 		panic("dom0_mem is too small, boot aborted"
@@ -1081,6 +1169,8 @@ int construct_dom0(struct domain *d, 
 
 	printk("*** LOADING DOMAIN 0 ***\n");
 
+	calc_dom0_size();
+
 	max_pages = dom0_size / PAGE_SIZE;
 	d->max_pages = max_pages;
 	d->tot_pages = 0;
@@ -1260,10 +1350,12 @@ extern void cpu_halt(void);
 void machine_halt(void)
 {
 	console_start_sync();
-	if (running_on_sim)
-		printk ("machine_halt called.  spinning...\n");
-	else
-		cpu_halt();
+
+#ifdef CONFIG_SMP
+	smp_send_stop();
+#endif
+
+	printk ("machine_halt called.  spinning...\n");
 	while(1);
 }
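
calc_dom0_size() above caps dom0 at the available domheap minus an estimated P2M allocation (one page of PTEs per PTRS_PER_PTE pages) and a spare pool for DMA/PCI mappings (8192 pages plus one page per 4096 domheap pages).  The small program below runs the same arithmetic for an assumed 16KB page size and a pretend 8GB domheap, just to show the order of magnitude:

#include <stdio.h>

int main(void)
{
    unsigned long page_size      = 1UL << 14;              /* assumed 16KB pages */
    unsigned long ptrs_per_pte   = page_size / 8;           /* 2048 PTEs per page */
    unsigned long domheap_pages  = (8UL << 30) / page_size; /* pretend 8GB of domheap */

    unsigned long p2m_pages      = domheap_pages / ptrs_per_pte;
    unsigned long spare_hv_pages = 8192 + domheap_pages / 4096;
    unsigned long max_dom0_size  =
        (domheap_pages - (p2m_pages + spare_hv_pages)) * page_size;

    printf("max dom0: %lu MB out of %lu MB\n",
           max_dom0_size >> 20, (domheap_pages * page_size) >> 20);
    return 0;
}
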
 
diff -Naurp xen/arch/ia64/xen/dom_fw.c xen-redhat/arch/ia64/xen/dom_fw.c
--- xen/arch/ia64/xen/dom_fw.c
+++ xen-redhat/arch/ia64/xen/dom_fw.c
@@ -144,6 +144,117 @@ build_pal_hypercall_bundles(u64 *imva, u
 	ia64_fc(imva + 3);
 }
 
+/* xen fpswa call stub. 14 bundles */
+extern const unsigned long xen_ia64_fpswa_call_stub[];
+extern const unsigned long xen_ia64_fpswa_call_stub_end[];
+extern const unsigned long xen_ia64_fpswa_call_stub_patch[];
+asm(
+	".align 32\n"
+	".proc xen_ia64_fpswa_call_stub;\n"
+	"xen_ia64_fpswa_call_stub:\n"
+	".prologue\n"
+	"alloc r3 = ar.pfs, 8, 0, 0, 0\n"
+	".body\n"
+	"mov r14 = in0\n"
+	"ld8 r15 = [in1], 8\n"
+	";;\n"
+	"ld8 r16 = [in1]\n"
+	"ld8 r17 = [in2]\n"
+	"ld8 r18 = [in3]\n"
+	"ld8 r19 = [in4]\n"
+	"ld8 r20 = [in5]\n"
+	"ld8 r21 = [in6]\n"
+	"ld8 r22 = [in7], 8\n"
+	";;\n"
+	"ld8 r23 = [in7], 8\n"
+	";;\n"
+	"ld8 r24 = [in7], 8\n"
+	";;\n"
+	"cmp.ne p6, p0 = r24, r0\n"
+	"ld8 r25 = [in7], 8\n"
+	";;\n"
+	"(p6) tpa r24 = r24\n"
+	"cmp.ne p7, p0 = r25, r0\n"
+	"ld8 r26 = [in7], 8\n"
+	";;\n"
+	"(p7)tpa r25 = r25\n"
+	"cmp.ne p8, p0 = r26, r0\n"
+	"ld8 r27 = [in7], 8\n"
+	";;\n"
+	"(p8)tpa r26 = r26\n"
+	"cmp.ne p9, p0 = r27, r0\n"
+	";;\n"
+	"tpa r27 = r27\n"
+	"xen_ia64_fpswa_call_stub_patch:"
+	"{\n"
+	"mov r2 = " FW_HYPERCALL_FPSWA_STR "\n"
+	"break " __IA64_XEN_HYPERCALL_DEFAULT_STR "\n"
+	"nop.i 0\n"
+	"}\n"
+	"st8 [in2] = r17\n"
+	"st8 [in3] = r18\n"
+	"st8 [in4] = r19\n"
+	"st8 [in5] = r20\n"
+	"st8 [in6] = r21\n"
+	"br.ret.sptk.many rp\n"
+	"xen_ia64_fpswa_call_stub_end:"
+	".endp xen_ia64_fpswa_call_stub\n"
+);
+
+static void
+build_fpswa_hypercall_bundle(uint64_t *imva, uint64_t brkimm, uint64_t hypnum)
+{
+	INST64_A5 slot0;
+	INST64_I19 slot1;
+	INST64_I18 slot2;
+	IA64_BUNDLE bundle;
+
+	/* slot0: mov r2 = hypnum (low 20 bits) */
+	slot0.inst = 0;
+	slot0.qp = 0;
+	slot0.r1 = 2;
+	slot0.r3 = 0;
+	slot0.major = 0x9;
+
+	slot0.s = 0;
+	slot0.imm9d = hypnum >> 7;
+	slot0.imm5c = hypnum >> 16;
+	slot0.imm7b = hypnum;
+
+	/* slot1: break brkimm */
+	slot1.inst = 0;
+	slot1.qp = 0;
+	slot1.x6 = 0;
+	slot1.x3 = 0;
+	slot1.major = 0x0;
+	slot1.i = brkimm >> 20;
+	slot1.imm20 = brkimm;
+
+	/* slot2: nop.i */
+	slot2.inst = 0;
+	slot2.qp = 0;
+	slot2.imm20 = 0;
+	slot2.y = 0;
+	slot2.x6 = 1;
+	slot2.x3 = 0;
+	slot2.i = 0;
+	slot2.major = 0;
+
+	/* MII bundle */
+	bundle.i64[0] = 0;
+	bundle.i64[1] = 0;
+	bundle.template = 0x0; /* MII */
+	bundle.slot0 = slot0.inst;
+	bundle.slot1a = slot1.inst;
+	bundle.slot1b = slot1.inst >> 18;
+	bundle.slot2 = slot2.inst;
+	
+	imva[0] = bundle.i64[0];
+	imva[1] = bundle.i64[1];
+	ia64_fc(imva);
+	ia64_fc(imva + 1);
+}
+
 // builds a hypercall bundle at domain physical address
 static void
 dom_fpswa_hypercall_patch(struct domain *d, unsigned long imva)
@@ -151,6 +262,10 @@ dom_fpswa_hypercall_patch(struct domain 
 	unsigned long *entry_imva, *patch_imva;
 	const unsigned long entry_paddr = FW_HYPERCALL_FPSWA_ENTRY_PADDR;
 	const unsigned long patch_paddr = FW_HYPERCALL_FPSWA_PATCH_PADDR;
+	const size_t stub_size =
+		(char*)xen_ia64_fpswa_call_stub_end -
+		(char*)xen_ia64_fpswa_call_stub;
+	size_t i;
 
 	entry_imva = (unsigned long *)(imva + entry_paddr -
 	                               FW_HYPERCALL_BASE_PADDR);
@@ -160,9 +275,19 @@ dom_fpswa_hypercall_patch(struct domain 
 	/* Descriptor.  */
 	*entry_imva++ = patch_paddr;
 	*entry_imva   = 0;
+        /* see dom_fw.h */
+        BUILD_BUG_ON((char*)xen_ia64_fpswa_call_stub_end -
+                     (char*)xen_ia64_fpswa_call_stub > 0xff - 16);
+ 
+        /* call stub */
+        memcpy(patch_imva, xen_ia64_fpswa_call_stub, stub_size);
+        for (i = 0; i < stub_size; i++)
+                ia64_fc(imva + i);
+        patch_imva +=
+                xen_ia64_fpswa_call_stub_patch - xen_ia64_fpswa_call_stub;
+        build_fpswa_hypercall_bundle(patch_imva, d->arch.breakimm,
+							FW_HYPERCALL_FPSWA);
 
-	build_hypercall_bundle(patch_imva, d->arch.breakimm,
-	                       FW_HYPERCALL_FPSWA, 1);
 }
 
 // builds a hypercall bundle at domain physical address
@@ -489,7 +614,7 @@ efi_mdt_cmp(const void *a, const void *b
 
 #define NFUNCPTRS 16
 #define NUM_EFI_SYS_TABLES 6
-#define NUM_MEM_DESCS 64 //large enough
+#define NUM_MEM_DESCS 256 //large enough
 
 struct fw_tables {
 	efi_system_table_t efi_systab;
diff -Naurp xen/arch/ia64/xen/faults.c xen-redhat/arch/ia64/xen/faults.c
--- xen/arch/ia64/xen/faults.c
+++ xen-redhat/arch/ia64/xen/faults.c
@@ -93,6 +93,8 @@ void reflect_interruption(unsigned long 
 	regs->cr_ipsr = (regs->cr_ipsr & ~DELIVER_PSR_CLR) | DELIVER_PSR_SET;
 	if (PSCB(v, dcr) & IA64_DCR_BE)
 		regs->cr_ipsr |= IA64_PSR_BE;
+	else
+		regs->cr_ipsr &= ~IA64_PSR_BE;
 
 	if (PSCB(v, hpsr_dfh))
 		regs->cr_ipsr |= IA64_PSR_DFH;  
@@ -158,6 +160,8 @@ void reflect_event(void)
 	regs->cr_ipsr = (regs->cr_ipsr & ~DELIVER_PSR_CLR) | DELIVER_PSR_SET;
 	if (PSCB(v, dcr) & IA64_DCR_BE)
 		regs->cr_ipsr |= IA64_PSR_BE;
+	else
+		regs->cr_ipsr &= ~IA64_PSR_BE;
 
 	if (PSCB(v, hpsr_dfh))
 		regs->cr_ipsr |= IA64_PSR_DFH;
@@ -272,6 +276,11 @@ void ia64_do_page_fault(unsigned long ad
 		regs->cr_ipsr =
 		    (regs->cr_ipsr & ~DELIVER_PSR_CLR) | DELIVER_PSR_SET;
 
+		if (PSCB(current, dcr) & IA64_DCR_BE)
+			regs->cr_ipsr |= IA64_PSR_BE;
+		else
+			regs->cr_ipsr &= ~IA64_PSR_BE;
+
 		if (PSCB(current, hpsr_dfh))
 			regs->cr_ipsr |= IA64_PSR_DFH;  
 		PSCB(current, vpsr_dfh) = 0;
@@ -340,10 +349,10 @@ fp_emulate(int fp_fault, void *bundle, u
 unsigned long
 handle_fpu_swa(int fp_fault, struct pt_regs *regs, unsigned long isr)
 {
-	struct vcpu *v = current;
 	IA64_BUNDLE bundle;
 	unsigned long fault_ip;
 	fpswa_ret_t ret;
+	unsigned long rc;
 
 	fault_ip = regs->cr_iip;
 	/*
@@ -355,23 +364,25 @@ handle_fpu_swa(int fp_fault, struct pt_r
 		fault_ip -= 16;
 
 	if (VMX_DOMAIN(current)) {
-		if (IA64_RETRY == __vmx_get_domain_bundle(fault_ip, &bundle))
-			return IA64_RETRY;
-	} else
-		bundle = __get_domain_bundle(fault_ip);
-
-	if (!bundle.i64[0] && !bundle.i64[1]) {
-		printk("%s: floating-point bundle at 0x%lx not mapped\n",
-		       __FUNCTION__, fault_ip);
-		return -1;
+		rc = __vmx_get_domain_bundle(fault_ip, &bundle);
+	} else {
+		rc = 0;
+		if (vcpu_get_domain_bundle(current, regs, fault_ip,
+					   &bundle) == 0)
+			rc = IA64_RETRY;
+	}
+	if (rc == IA64_RETRY) {
+		gdprintk(XENLOG_DEBUG,
+			 "%s(%s): floating-point bundle at 0x%lx not mapped\n",
+			 __FUNCTION__, fp_fault ? "fault" : "trap", fault_ip);
+		return IA64_RETRY;
 	}
 
 	ret = fp_emulate(fp_fault, &bundle, &regs->cr_ipsr, &regs->ar_fpsr,
 	                 &isr, &regs->pr, &regs->cr_ifs, regs);
 
 	if (ret.status) {
-		PSCBX(v, fpswa_ret) = ret;
-		printk("%s(%s): fp_emulate() returned %ld\n",
+		gdprintk(XENLOG_ERR, "%s(%s): fp_emulate() returned %ld\n",
 		       __FUNCTION__, fp_fault ? "fault" : "trap", ret.status);
 	}
 
@@ -434,6 +445,13 @@ ia64_fault(unsigned long vector, unsigne
 		printk("Dirty-bit.\n");
 		break;
 
+	case 10:
+		/* __domain_get_bundle() may cause fault. */
+		if (ia64_done_with_exception(regs))
+			return;
+		printk("Data Access-bit.\n");
+		break;
+
 	case 20:
 		printk("Page Not Found.\n");
 		break;
@@ -584,6 +602,17 @@ ia64_handle_privop(unsigned long ifa, st
 }
 
 void
+ia64_lazy_load_fpu(struct vcpu *v)
+{
+	if (PSCB(v, hpsr_dfh)) {
+		PSCB(v, hpsr_dfh) = 0;
+		PSCB(v, hpsr_mfh) = 1;
+		if (__ia64_per_cpu_var(fp_owner) != v)
+			__ia64_load_fpu(v->arch._thread.fph);
+	}
+}
+
+void
 ia64_handle_reflection(unsigned long ifa, struct pt_regs *regs,
                        unsigned long isr, unsigned long iim,
                        unsigned long vector)
@@ -622,12 +651,7 @@ ia64_handle_reflection(unsigned long ifa
 		vector = IA64_GENEX_VECTOR;
 		break;
 	case 25:
-		if (PSCB(v, hpsr_dfh)) {
-			PSCB(v, hpsr_dfh) = 0;
-			PSCB(v, hpsr_mfh) = 1;
-			if (__ia64_per_cpu_var(fp_owner) != v)
-				__ia64_load_fpu(v->arch._thread.fph);
-		}
+		ia64_lazy_load_fpu(v);
 		if (!PSCB(v, vpsr_dfh)) {
 			regs->cr_ipsr &= ~IA64_PSR_DFH;
 			return;
@@ -638,8 +662,6 @@ ia64_handle_reflection(unsigned long ifa
 		if (((isr >> 4L) & 0xfL) == 1) {
 			/* Fault is due to a register NaT consumption fault. */
 			//regs->eml_unat = 0;  FIXME: DO WE NEED THIS??
-			printk("ia64_handle_reflection: handling regNaT "
-			       "fault\n");
 			vector = IA64_NAT_CONSUMPTION_VECTOR;
 			break;
 		}
@@ -674,6 +696,11 @@ ia64_handle_reflection(unsigned long ifa
 		PSCB(current, iim) = iim;
 		vector = IA64_SPECULATION_VECTOR;
 		break;
+	case 29:
+		vector = IA64_DEBUG_VECTOR;
+		if (debugger_trap_entry(vector, regs))
+			return;
+		break;
 	case 30:
 		// FIXME: Should we handle unaligned refs in Xen??
 		vector = IA64_UNALIGNED_REF_VECTOR;
@@ -684,33 +711,31 @@ ia64_handle_reflection(unsigned long ifa
 			vcpu_increment_iip(v);
 			return;
 		}
-		// fetch code fail
-		if (IA64_RETRY == status)
-			return;
-		printk("ia64_handle_reflection: handling FP fault\n");
 		vector = IA64_FP_FAULT_VECTOR;
 		break;
 	case 33:
 		status = handle_fpu_swa(0, regs, isr);
 		if (!status)
 			return;
-		// fetch code fail
-		if (IA64_RETRY == status)
-			return;
-		printk("ia64_handle_reflection: handling FP trap\n");
 		vector = IA64_FP_TRAP_VECTOR;
 		break;
 	case 34:
-		printk("ia64_handle_reflection: handling lowerpriv trap\n");
+		if (isr & (1UL << 4))
+			printk("ia64_handle_reflection: handling "
+			       "unimplemented instruction address %s\n",
+			       (isr & (1UL<<32)) ? "fault" : "trap");
 		vector = IA64_LOWERPRIV_TRANSFER_TRAP_VECTOR;
 		break;
 	case 35:
 		printk("ia64_handle_reflection: handling taken branch trap\n");
 		vector = IA64_TAKEN_BRANCH_TRAP_VECTOR;
+		if (debugger_trap_entry(vector,regs))
+			return;
 		break;
 	case 36:
-		printk("ia64_handle_reflection: handling single step trap\n");
 		vector = IA64_SINGLE_STEP_TRAP_VECTOR;
+		if (debugger_trap_entry(vector,regs))
+			return;
 		break;
 
 	default:
diff -Naurp xen/arch/ia64/xen/fw_emul.c xen-redhat/arch/ia64/xen/fw_emul.c
--- xen/arch/ia64/xen/fw_emul.c
+++ xen-redhat/arch/ia64/xen/fw_emul.c
@@ -35,6 +35,7 @@
 #include <xen/hypercall.h>
 #include <xen/softirq.h>
 #include <xen/time.h>
+#include <asm/vmx_phy_mode.h>
 
 static DEFINE_SPINLOCK(efi_time_services_lock);
 
@@ -240,6 +241,8 @@ sal_emulator (long index, unsigned long 
 			}
 			e = list_entry(sal_queue[in1].next,
 			               sal_queue_entry_t, list);
+
+			list_del(&e->list);
 			spin_unlock_irqrestore(&sal_queue_lock, flags);
 
 			IA64_SAL_DEBUG("SAL_GET_STATE_INFO(%s <= %s) "
@@ -275,10 +278,12 @@ sal_emulator (long index, unsigned long 
 			r9 = arg.ret;
 			status = arg.status;
 			if (r9 == 0) {
+				xfree(e);
+			} else {
+				/* Re-add the entry to sal_queue */
 				spin_lock_irqsave(&sal_queue_lock, flags);
-				list_del(&e->list);
+				list_add(&e->list, &sal_queue[in1]);
 				spin_unlock_irqrestore(&sal_queue_lock, flags);
-				xfree(e);
 			}
 		} else {
 			status = IA64_SAL_NO_INFORMATION_AVAILABLE;
@@ -314,10 +319,10 @@ sal_emulator (long index, unsigned long 
 			               "on CPU#%d.\n",
 			               rec_name[e->sal_info_type],
 			               rec_name[in1], e->cpuid);
-			
 
 			arg.type = e->sal_info_type;
 			arg.status = 0;
+
 			if (e->cpuid == smp_processor_id()) {
 				IA64_SAL_DEBUG("SAL_CLEAR_STATE_INFO: local\n");
 				clear_state_info_on(&arg);
@@ -446,6 +451,45 @@ sal_emulator (long index, unsigned long 
 	return ((struct sal_ret_values) {status, r9, r10, r11});
 }
 
+static int
+safe_copy_to_guest(unsigned long to, void *from, long size)
+{
+	BUG_ON((unsigned)size > PAGE_SIZE);
+
+	if (VMX_DOMAIN(current)) {
+		if (is_virtual_mode(current)) {
+			thash_data_t *data;
+			unsigned long gpa, poff;
+
+			/* The caller must provide a DTR or DTC mapping */
+			data = vtlb_lookup(current, to, DSIDE_TLB);
+			if (data) {
+				gpa = data->page_flags & _PAGE_PPN_MASK;
+			} else {
+				data = vhpt_lookup(to);
+				if (!data)
+					return -1;
+				gpa = __mpa_to_gpa(
+					data->page_flags & _PAGE_PPN_MASK);
+				gpa &= _PAGE_PPN_MASK;
+			}
+			poff = POFFSET(to, data->ps);
+			if (poff + size > PSIZE(data->ps))
+				return -1;
+			to = PAGEALIGN(gpa, data->ps) | poff;
+		}
+		to |= XENCOMM_INLINE_FLAG;
+		if (xencomm_copy_to_guest((void *)to, from, size, 0) != 0)
+			return -1;
+		return 0;
+	} else {
+		/* check for vulnerability */
+		if (IS_VMM_ADDRESS(to) || IS_VMM_ADDRESS(to + size - 1))
+			panic_domain(NULL, "copy to bad address:0x%lx\n", to);
+		return copy_to_user((void __user *)to, from, size);
+	}
+}
+
 cpumask_t cpu_cache_coherent_map;
 
 struct cache_flush_args {
@@ -468,6 +512,19 @@ remote_pal_cache_flush(void *v)
 		args->status = status;
 }
 
+static void
+remote_pal_prefetch_visibility(void *v)
+{
+	s64 trans_type = (s64)v;
+	ia64_pal_prefetch_visibility(trans_type);
+}
+
+static void
+remote_pal_mc_drain(void *v)
+{
+	ia64_pal_mc_drain();
+}
+
 struct ia64_pal_retval
 xen_pal_emulator(unsigned long index, u64 in1, u64 in2, u64 in3)
 {
@@ -682,16 +739,13 @@ xen_pal_emulator(unsigned long index, u6
 					pm_buffer,
 					(pal_perf_mon_info_u_t *) &r9);
 			if (status != 0) {
-				while(1)
 				printk("PAL_PERF_MON_INFO fails ret=%ld\n", status);
 				break;
 			}
-			if (copy_to_user((void __user *)in1,pm_buffer,128)) {
-				while(1)
-				printk("xen_pal_emulator: PAL_PERF_MON_INFO "
-					"can't copy to user!!!!\n");
-				status = PAL_STATUS_UNIMPLEMENTED;
-				break;
+			if (safe_copy_to_guest(
+				in1, pm_buffer, sizeof(pm_buffer))) {
+				status = PAL_STATUS_EINVAL;
+				goto fail_to_copy;
 			}
 		}
 		break;
@@ -713,10 +767,11 @@ xen_pal_emulator(unsigned long index, u6
 		       consumes 10 mW, implemented and cache/TLB coherent.  */
 		    unsigned long res = 1000UL | (1000UL << 16) | (10UL << 32)
 			    | (1UL << 61) | (1UL << 60);
-		    if (copy_to_user ((void *)in1, &res, sizeof (res)))
-			    status = PAL_STATUS_EINVAL;    
-		    else
-			    status = PAL_STATUS_SUCCESS;
+		    if (safe_copy_to_guest (in1, &res, sizeof (res))) {
+			    status = PAL_STATUS_EINVAL;
+			    goto fail_to_copy;
+		    }
+		    status = PAL_STATUS_SUCCESS;
 	        }
 		break;
 	    case PAL_HALT:
@@ -738,7 +793,35 @@ xen_pal_emulator(unsigned long index, u6
 		if (VMX_DOMAIN(current))
 			status = PAL_STATUS_SUCCESS;
 		break;
+	    case PAL_PREFETCH_VISIBILITY:
+		status = ia64_pal_prefetch_visibility(in1);
+		if (status == 0) {
+			/* must be performed on all remote processors 
+			   in the coherence domain. */
+			smp_call_function(remote_pal_prefetch_visibility,
+					  (void *)in1, 1, 1);
+			status = 1; /* no more necessary on remote processor */
+		}
+		break;
+	    case PAL_MC_DRAIN:
+		status = ia64_pal_mc_drain();
+		/* FIXME: All vcpus likely call PAL_MC_DRAIN.
+		   That causes congestion. */
+		smp_call_function(remote_pal_mc_drain, NULL, 1, 1);
+		break;
+	    case PAL_BRAND_INFO:
+		if (in1 == 0) {
+			char brand_info[128];
+			status = ia64_pal_get_brand_info(brand_info);
+			if (status == PAL_STATUS_SUCCESS)
+				copy_to_user((void *)in2, brand_info, 128);
+		} else {
+			status = PAL_STATUS_EINVAL;
+		}
+		break;
 	    case PAL_LOGICAL_TO_PHYSICAL:
+	    case PAL_GET_PSTATE:
+	    case PAL_CACHE_SHARED_INFO:
 		/* Optional, no need to complain about being unimplemented */
 		break;
 	    default:
@@ -747,6 +830,12 @@ xen_pal_emulator(unsigned long index, u6
 		break;
 	}
 	return ((struct ia64_pal_retval) {status, r9, r10, r11});
+
+fail_to_copy:
+	gdprintk(XENLOG_WARNING,
+		"PAL(%ld) fail to copy!!! args 0x%lx 0x%lx 0x%lx\n",
+		index, in1, in2, in3);
+	return ((struct ia64_pal_retval) {status, r9, r10, r11});
 }
 
 // given a current domain (virtual or metaphysical) address, return the virtual address
@@ -1093,6 +1182,10 @@ efi_emulate_set_virtual_address_map(
 	efi_desc_size = sizeof(efi_memory_desc_t);
 
 	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
+		struct page_info *efi_runtime_page = NULL;
+		struct page_info *fpswa_inf_page = NULL;
+		struct page_info *fw_table_page = NULL;
+		
 		if (copy_from_user(&entry, p, sizeof(efi_memory_desc_t))) {
 			printk ("efi_emulate_set_virtual_address_map: copy_from_user() fault. addr=0x%p\n", p);
 			return EFI_UNSUPPORTED;
@@ -1102,6 +1195,27 @@ efi_emulate_set_virtual_address_map(
                 if (md->type != EFI_PAL_CODE)
                         continue;
 
+		/* Get pages to prevent them from being freed
+		 * while we touch them.
+		 * Those entries are in [FW_TABLES_BASE_PADDR, ...];
+		 * see dom_fw.h for the layout.
+		 */
+		efi_runtime_page = virt_to_page(efi_runtime);
+		fpswa_inf_page = virt_to_page(fpswa_inf);
+		fw_table_page = virt_to_page(
+			domain_mpa_to_imva(d, FW_TABLES_BASE_PADDR));
+		if (get_page(efi_runtime_page, d) == 0)
+			return EFI_INVALID_PARAMETER;
+		if (get_page(fpswa_inf_page, d) == 0) {
+			put_page(efi_runtime_page);
+			return EFI_INVALID_PARAMETER;
+		}
+		if (get_page(fw_table_page, d) == 0) {
+			put_page(fpswa_inf_page);
+			put_page(efi_runtime_page);
+			return EFI_INVALID_PARAMETER;
+		}
+
 #define EFI_HYPERCALL_PATCH_TO_VIRT(tgt,call) \
 	do { \
 		vfn = (unsigned long *) domain_mpa_to_imva(d, tgt); \
@@ -1124,6 +1238,10 @@ efi_emulate_set_virtual_address_map(
 		*vfn++ = FW_HYPERCALL_FPSWA_PATCH_INDEX * 16UL + md->virt_addr;
 		*vfn   = 0;
 		fpswa_inf->fpswa = (void *) (FW_HYPERCALL_FPSWA_ENTRY_INDEX * 16UL + md->virt_addr);
+
+		put_page(fw_table_page);
+		put_page(fpswa_inf_page);
+		put_page(efi_runtime_page);
 		break;
 	}
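
The SAL_GET_STATE_INFO change above now removes the record from sal_queue under the lock before handing it to the guest, frees it only when the guest reports success, and puts it back on the queue otherwise, so a failed read does not lose the record.  The toy queue below models that dequeue/requeue-on-failure flow with an invented record type and consumer:

#include <stdio.h>
#include <stdlib.h>

/* Toy record queue standing in for sal_queue[] (names invented). */
struct rec {
    struct rec *next;
    int id;
};

static struct rec *queue;

static struct rec *dequeue(void)
{
    struct rec *e = queue;
    if (e)
        queue = e->next;
    return e;
}

static void requeue(struct rec *e)
{
    e->next = queue;        /* put it back at the head, like list_add() */
    queue = e;
}

/* Pretend consumer: fails on odd ids so the requeue path is exercised. */
static int consume(struct rec *e)
{
    return (e->id & 1) ? -1 : 0;
}

int main(void)
{
    for (int i = 3; i >= 1; i--) {       /* queue ends up as 1, 2, 3 */
        struct rec *e = malloc(sizeof(*e));
        e->id = i;
        requeue(e);
    }

    struct rec *e = dequeue();           /* take it off before handing it out */
    if (consume(e) == 0) {
        free(e);                         /* success: record is gone for good */
    } else {
        requeue(e);                      /* failure: make it visible again */
        printf("record %d requeued\n", e->id);
    }
    return 0;
}
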
 
diff -Naurp xen/arch/ia64/xen/hypercall.c xen-redhat/arch/ia64/xen/hypercall.c
--- xen/arch/ia64/xen/hypercall.c
+++ xen-redhat/arch/ia64/xen/hypercall.c
@@ -17,6 +17,7 @@
 #include <asm/sal.h>	/* FOR struct ia64_sal_retval */
 #include <asm/fpswa.h>	/* FOR struct fpswa_ret_t */
 
+#include <asm/vmx.h>
 #include <asm/vmx_vcpu.h>
 #include <asm/vcpu.h>
 #include <asm/dom_fw.h>
@@ -121,14 +122,135 @@ fw_hypercall_ipi (struct pt_regs *regs)
 	return;
 }
 
+static int
+fpswa_get_domain_addr(struct vcpu *v, unsigned long gpaddr, size_t size,
+		      void **virt, struct page_info **page, const char *name)
+{
+	int cross_page_boundary;
+
+	if (gpaddr == 0) {
+		*virt = 0;
+		return 0;
+	}
+
+	cross_page_boundary = (((gpaddr & ~PAGE_MASK) + size) > PAGE_SIZE);
+	if (unlikely(cross_page_boundary)) {
+		/* this case isn't implemented */
+		gdprintk(XENLOG_ERR,
+			 "%s: fpswa hypercall is called with "
+			 "page crossing argument %s 0x%lx\n",
+			 __func__, name, gpaddr);
+		return -ENOSYS;
+	}
+
+again:
+        *virt = domain_mpa_to_imva(v->domain, gpaddr);
+        *page = virt_to_page(*virt);
+        if (get_page(*page, current->domain) == 0) {
+                if (page_get_owner(*page) != current->domain) {
+			*page = NULL;
+			return -EFAULT;
+		}
+                goto again;
+        }
+
+	return 0;
+}
+
 static fpswa_ret_t
-fw_hypercall_fpswa (struct vcpu *v)
+fw_hypercall_fpswa (struct vcpu *v, struct pt_regs *regs)
 {
-	return PSCBX(v, fpswa_ret);
+	fpswa_ret_t ret = {-1, 0, 0, 0};
+	unsigned long bundle[2] = { regs->r15, regs->r16};
+	fp_state_t fp_state;
+	struct page_info *lp_page = NULL;
+	struct page_info *lv_page = NULL;
+	struct page_info *hp_page = NULL;
+	struct page_info *hv_page = NULL;
+
+ 	if (unlikely(PSCBX(v, fpswa_ret).status != 0 && 
+ 		     PSCBX(v, fpswa_ret).status != IA64_RETRY)) {
+ 		ret = PSCBX(v, fpswa_ret);
+ 		PSCBX(v, fpswa_ret) = (fpswa_ret_t){0, 0, 0, 0};
+ 		return ret;
+ 	}
+
+	if (!fpswa_interface)
+		goto error;
+
+	memset(&fp_state, 0, sizeof(fp_state));
+	fp_state.bitmask_low64 = regs->r22;
+	fp_state.bitmask_high64 = regs->r23;
+
+	/* bit6..bit11 */
+	if ((fp_state.bitmask_low64 & 0xfc0) != 0xfc0) {
+		/* other cases aren't supported yet */
+		gdprintk(XENLOG_ERR, "%s unsupported bitmask_low64 0x%lx\n",
+			 __func__, fp_state.bitmask_low64);
+		goto error;
+	}
+	if (regs->r25 == 0)
+		/* fp_state.fp_state_low_volatile must be supplied */
+		goto error;
+
+	/* eager save/lazy restore fpu: f32...f127 */
+	if ((~fp_state.bitmask_low64 & ((1UL << 31) - 1)) != 0 ||
+	    ~fp_state.bitmask_high64 != 0) {
+		if (VMX_DOMAIN(v))
+			vmx_lazy_load_fpu(v);
+		else
+			ia64_lazy_load_fpu(v);
+	}
+
+	if (fpswa_get_domain_addr(v, regs->r24,
+				  sizeof(fp_state.fp_state_low_preserved), 
+				  (void*)&fp_state.fp_state_low_preserved,
+				  &lp_page, "fp_state_low_preserved") < 0)
+		goto error;
+	if (fpswa_get_domain_addr(v, regs->r25,
+				  sizeof(fp_state.fp_state_low_volatile),
+				  (void*)&fp_state.fp_state_low_volatile,
+				  &lv_page, "fp_state_low_volatile") < 0)
+		goto error;
+	if (fpswa_get_domain_addr(v, regs->r26,
+				  sizeof(fp_state.fp_state_high_preserved),
+				  (void*)&fp_state.fp_state_high_preserved,
+				  &hp_page, "fp_state_high_preserved") < 0)
+		goto error;
+	if (fpswa_get_domain_addr(v, regs->r27,
+				  sizeof(fp_state.fp_state_high_volatile),
+				  (void*)&fp_state.fp_state_high_volatile,
+				  &hv_page, "fp_state_high_volatile") < 0)
+		goto error;
+
+	ret = (*fpswa_interface->fpswa)(regs->r14,
+					bundle,
+					&regs->r17,	/* pipsr */
+					&regs->r18,	/* pfsr */
+					&regs->r19,	/* pisr */
+					&regs->r20,	/* ppreds */
+					&regs->r21,	/* pifs	*/
+					&fp_state);
+
+error:
+	if (lp_page != NULL)
+		put_page(lp_page);
+	if (lv_page != NULL)
+		put_page(lv_page);
+	if (hp_page != NULL)
+		put_page(hp_page);
+	if (hv_page != NULL)
+		put_page(hv_page);
+	return ret;
 }
 
-IA64FAULT
-ia64_hypercall(struct pt_regs *regs)
+static fpswa_ret_t
+fw_hypercall_fpswa_error(void)
+{
+	return (fpswa_ret_t) {-1, 0, 0, 0};
+}
+
+IA64FAULT ia64_hypercall(struct pt_regs *regs)
 {
 	struct vcpu *v = current;
 	struct sal_ret_values x;
@@ -177,6 +299,9 @@ ia64_hypercall(struct pt_regs *regs)
 				/* do_block only pends a softirq */
 				do_softirq();
 				stop_timer(&v->arch.hlt_timer);
+				/* do_block() calls local_event_delivery_enable(),
+				   but PAL calls are always made with psr.i = 0. */
+				local_event_delivery_disable();
 			}
 			regs->r8 = 0;
 			regs->r9 = 0;
@@ -221,8 +346,24 @@ ia64_hypercall(struct pt_regs *regs)
 	case FW_HYPERCALL_SET_SHARED_INFO_VA:
 	        regs->r8 = domain_set_shared_info_va (regs->r28);
 		break;
-	case FW_HYPERCALL_FPSWA:
-		fpswa_ret = fw_hypercall_fpswa (v);
+	case FW_HYPERCALL_FPSWA_BASE:
+		switch (regs->r2) {
+		case FW_HYPERCALL_FPSWA_BROKEN:
+			gdprintk(XENLOG_WARNING,
+				 "Old fpswa hypercall was called (0x%lx).\n"
+				 "Please update your domain builder. ip 0x%lx\n",
+				 FW_HYPERCALL_FPSWA_BROKEN, regs->cr_iip);
+			fpswa_ret = fw_hypercall_fpswa_error();
+			break;
+		case FW_HYPERCALL_FPSWA:
+			fpswa_ret = fw_hypercall_fpswa(v, regs);
+			break;
+		default:
+			gdprintk(XENLOG_ERR, "unknown fpswa hypercall %lx\n",
+				 regs->r2);
+			fpswa_ret = fw_hypercall_fpswa_error();
+			break;
+		}
 		regs->r8  = fpswa_ret.status;
 		regs->r9  = fpswa_ret.err0;
 		regs->r10 = fpswa_ret.err1;
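
fpswa_get_domain_addr() in the hunk above rejects FPSWA state buffers that straddle a page boundary, since the cross-page case is not implemented; the test is just the offset within the page plus the size compared against PAGE_SIZE.  A tiny demonstration of the check, assuming a 16KB page size:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 0x4000UL              /* assumed 16KB page size */
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* True if [gpaddr, gpaddr + size) spills over into the next page. */
static bool crosses_page(unsigned long gpaddr, unsigned long size)
{
    return ((gpaddr & ~PAGE_MASK) + size) > PAGE_SIZE;
}

int main(void)
{
    printf("%d\n", crosses_page(0x4000, 16));   /* 0: fits at page start */
    printf("%d\n", crosses_page(0x7ff8, 16));   /* 1: straddles 0x8000 */
    printf("%d\n", crosses_page(0x7ff0, 16));   /* 0: ends exactly at 0x8000 */
    return 0;
}
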
diff -Naurp xen/arch/ia64/xen/irq.c xen-redhat/arch/ia64/xen/irq.c
--- xen/arch/ia64/xen/irq.c
+++ xen-redhat/arch/ia64/xen/irq.c
@@ -467,7 +467,7 @@ int pirq_guest_bind(struct vcpu *v, int 
     return rc;
 }
 
-int pirq_guest_unbind(struct domain *d, int irq)
+void pirq_guest_unbind(struct domain *d, int irq)
 {
     irq_desc_t         *desc = &irq_desc[irq];
     irq_guest_action_t *action;
@@ -501,7 +501,6 @@ int pirq_guest_unbind(struct domain *d, 
     }
 
     spin_unlock_irqrestore(&desc->lock, flags);    
-    return 0;
 }
 
 void
diff -Naurp xen/arch/ia64/xen/ivt.S xen-redhat/arch/ia64/xen/ivt.S
--- xen/arch/ia64/xen/ivt.S
+++ xen-redhat/arch/ia64/xen/ivt.S
@@ -977,10 +977,17 @@ ENTRY(daccess_bit)
 #ifdef XEN
 	mov r16=cr.isr
 	mov r17=cr.ifa
+	mov r18=cr.ipsr
 	mov r31=pr
 	mov r19=10
+	;;
 	mov r20=0x2800
-	br.sptk.many fast_access_reflect
+	extr.u r18=r18,IA64_PSR_CPL0_BIT,2
+	;;
+	cmp.ne p6,p0=r0,r18 	/* cpl != 0? */
+(p6)	br.sptk.many fast_access_reflect
+	/* __domain_get_bundle() may cause this fault. */
+	br.sptk.few dispatch_to_fault_handler
 	;;
 #else
 	// Like Entry 8, except for data access
@@ -1230,6 +1237,7 @@ fast_hypercall:
 	nop 0
 	bsw.1					// B (6 cyc) regs are saved, switch to bank 1
 	;;
+	PT_REGS_UNWIND_INFO(-48)
 
 	ssm psr.ic | PSR_DEFAULT_BITS		// M2	now it's safe to re-enable intr.-collection
 //	movl r3=ia64_ret_from_syscall		// X
@@ -2103,11 +2111,7 @@ END(speculation_vector)
 // 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56)
 ENTRY(debug_vector)
 	DBG_FAULT(29)
-#ifdef XEN
 	FAULT_OR_REFLECT(29)
-#else
-	FAULT(29)
-#endif
 END(debug_vector)
 
 	.org ia64_ivt+0x5a00
diff -Naurp xen/arch/ia64/xen/machine_kexec.c xen-redhat/arch/ia64/xen/machine_kexec.c
--- xen/arch/ia64/xen/machine_kexec.c
+++ xen-redhat/arch/ia64/xen/machine_kexec.c
@@ -1,6 +1,9 @@
 #include <xen/lib.h>       /* for printk() used in stubs */
 #include <xen/types.h>
 #include <public/kexec.h>
+#include <xen/mm.h>
+
+extern unsigned long frametable_pg_dir[];
 
 int machine_kexec_load(int type, int slot, xen_kexec_image_t *image)
 {
@@ -23,6 +26,14 @@ void machine_reboot_kexec(xen_kexec_imag
     printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
 }
 
+void arch_crash_save_vmcoreinfo(void)
+{
+    VMCOREINFO_SYMBOL(dom_xen);
+    VMCOREINFO_SYMBOL(dom_io);
+    VMCOREINFO_SYMBOL(xen_pstart);
+    VMCOREINFO_SYMBOL(frametable_pg_dir);
+}
+
 /*
  * Local variables:
  * mode: C
diff -Naurp xen/arch/ia64/xen/Makefile xen-redhat/arch/ia64/xen/Makefile
--- xen/arch/ia64/xen/Makefile
+++ xen-redhat/arch/ia64/xen/Makefile
@@ -31,6 +31,7 @@ obj-y += flushd.o
 obj-y += privop_stat.o
 obj-y += xenpatch.o
 obj-y += xencomm.o
+obj-y += pci.o
 
 obj-$(crash_debug) += gdbstub.o
 obj-$(xen_ia64_tlb_track) += tlb_track.o
diff -Naurp xen/arch/ia64/xen/mm.c xen-redhat/arch/ia64/xen/mm.c
--- xen/arch/ia64/xen/mm.c
+++ xen-redhat/arch/ia64/xen/mm.c
@@ -172,6 +172,7 @@
 #include <asm/vhpt.h>
 #include <asm/vcpu.h>
 #include <asm/shadow.h>
+#include <asm/event.h>
 #include <asm/p2m_entry.h>
 #include <asm/tlb_track.h>
 #include <linux/efi.h>
@@ -182,10 +183,11 @@
 static void domain_page_flush_and_put(struct domain* d, unsigned long mpaddr,
                                       volatile pte_t* ptep, pte_t old_pte, 
                                       struct page_info* page);
+static int efi_ucwb(unsigned long physaddr, unsigned long size);
 
 extern unsigned long ia64_iobase;
 
-static struct domain *dom_xen, *dom_io;
+struct domain *dom_xen, *dom_io;
 
 // followings are stolen from arch_init_memory() @ xen/arch/x86/mm.c
 void
@@ -208,6 +210,18 @@ alloc_dom_xen_and_dom_io(void)
     BUG_ON(dom_io == NULL);
 }
 
+static int
+mm_teardown_can_skip(struct domain* d, unsigned long offset)
+{
+    return d->arch.mm_teardown_offset > offset;
+}
+
+static void
+mm_teardown_update_offset(struct domain* d, unsigned long offset)
+{
+    d->arch.mm_teardown_offset = offset;
+}
+
 static void
 mm_teardown_pte(struct domain* d, volatile pte_t* pte, unsigned long offset)
 {
@@ -248,46 +262,73 @@ mm_teardown_pte(struct domain* d, volati
     }
 }
 
-static void
+static int
 mm_teardown_pmd(struct domain* d, volatile pmd_t* pmd, unsigned long offset)
 {
     unsigned long i;
     volatile pte_t* pte = pte_offset_map(pmd, offset);
 
     for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
-        if (!pte_present(*pte)) // acquire semantics
+        unsigned long cur_offset = offset + (i << PAGE_SHIFT);
+        if (mm_teardown_can_skip(d, cur_offset + PAGE_SIZE))
+            continue;
+        if (!pte_present(*pte)) { // acquire semantics
+            mm_teardown_update_offset(d, cur_offset);
             continue;
-        mm_teardown_pte(d, pte, offset + (i << PAGE_SHIFT));
+        }
+        mm_teardown_update_offset(d, cur_offset);
+        mm_teardown_pte(d, pte, cur_offset);
+        if (hypercall_preempt_check())
+            return -EAGAIN;
     }
+    return 0;
 }
 
-static void
+static int
 mm_teardown_pud(struct domain* d, volatile pud_t *pud, unsigned long offset)
 {
     unsigned long i;
     volatile pmd_t *pmd = pmd_offset(pud, offset);
 
     for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
-        if (!pmd_present(*pmd)) // acquire semantics
+        unsigned long cur_offset = offset + (i << PMD_SHIFT);
+        if (mm_teardown_can_skip(d, cur_offset + PMD_SIZE))
+            continue;
+        if (!pmd_present(*pmd)) { // acquire semantics
+            mm_teardown_update_offset(d, cur_offset);
             continue;
-        mm_teardown_pmd(d, pmd, offset + (i << PMD_SHIFT));
+        }
+        if (mm_teardown_pmd(d, pmd, cur_offset))
+            return -EAGAIN;
     }
+    return 0;
 }
 
-static void
+static int
 mm_teardown_pgd(struct domain* d, volatile pgd_t *pgd, unsigned long offset)
 {
     unsigned long i;
     volatile pud_t *pud = pud_offset(pgd, offset);
 
     for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
-        if (!pud_present(*pud)) // acquire semantics
+        unsigned long cur_offset = offset + (i << PUD_SHIFT);
+#ifndef __PAGETABLE_PUD_FOLDED
+        if (mm_teardown_can_skip(d, cur_offset + PUD_SIZE))
             continue;
-        mm_teardown_pud(d, pud, offset + (i << PUD_SHIFT));
+#endif
+        if (!pud_present(*pud)) { // acquire semantics
+#ifndef __PAGETABLE_PUD_FOLDED
+            mm_teardown_update_offset(d, cur_offset);
+#endif
+            continue;
+        }
+        if (mm_teardown_pud(d, pud, cur_offset))
+            return -EAGAIN;
     }
+    return 0;
 }
 
-void
+int
 mm_teardown(struct domain* d)
 {
     struct mm_struct* mm = &d->arch.mm;
@@ -295,14 +336,22 @@ mm_teardown(struct domain* d)
     volatile pgd_t* pgd;
 
     if (mm->pgd == NULL)
-        return;
+        return 0;
 
     pgd = pgd_offset(mm, 0);
     for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
-        if (!pgd_present(*pgd)) // acquire semantics
+        unsigned long cur_offset = i << PGDIR_SHIFT;
+
+        if (mm_teardown_can_skip(d, cur_offset + PGDIR_SIZE))
             continue;
-        mm_teardown_pgd(d, pgd, i << PGDIR_SHIFT);
+        if (!pgd_present(*pgd)) { // acquire semantics
+            mm_teardown_update_offset(d, cur_offset);
+            continue;
+        }
+        if (mm_teardown_pgd(d, pgd, cur_offset))
+            return -EAGAIN;
     }
+    return 0;
 }
 
 static void
@@ -492,7 +541,9 @@ u64 translate_domain_pte(u64 pteval, u64
 			   This can happen when domU tries to touch i/o
 			   port space.  Also prevents possible address
 			   aliasing issues.  */
-			if (!(mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE))
+			if (!(mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE) &&
+			    /* and also except for UC|WB pages */
+			    (d != dom0 || !efi_ucwb(mpaddr, PAGE_SIZE))) 
 				gdprintk(XENLOG_WARNING, "Warning: UC to WB "
 				         "for mpaddr=%lx\n", mpaddr);
 			pteval = (pteval & ~_PAGE_MA_MASK) | _PAGE_MA_WB;
@@ -666,19 +717,22 @@ unsigned long lookup_domain_mpa(struct d
             return GPFN_INV_MASK;
     }
 
-    if (mpaddr < d->arch.convmem_end) {
+    if (mpaddr < d->arch.convmem_end && !d->is_dying) {
         gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: non-allocated mpa "
-                 "0x%lx (< 0x%lx)\n", current->vcpu_id, PSCB(current, iip),
-                 mpaddr, d->arch.convmem_end);
+                 "d %"PRId16" 0x%lx (< 0x%lx)\n",
+                 current->vcpu_id, PSCB(current, iip),
+                 d->domain_id, mpaddr, d->arch.convmem_end);
     } else if (mpaddr - IO_PORTS_PADDR < IO_PORTS_SIZE) {
         /* Log I/O port probing, but complain less loudly about it */
         gdprintk(XENLOG_INFO, "vcpu %d iip 0x%016lx: bad I/O port access "
-                 "0x%lx\n", current->vcpu_id, PSCB(current, iip),
+                 "d %"PRId16" 0x%lx\n",
+                 current->vcpu_id, PSCB(current, iip), d->domain_id,
                  IO_SPACE_SPARSE_DECODING(mpaddr - IO_PORTS_PADDR));
     } else {
-        gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: bad mpa 0x%lx "
-                 "(=> 0x%lx)\n", current->vcpu_id, PSCB(current, iip),
-                 mpaddr, d->arch.convmem_end);
+        gdprintk(XENLOG_WARNING, "vcpu %d iip 0x%016lx: bad mpa "
+                 "d %"PRId16" 0x%lx (=> 0x%lx)\n",
+                 current->vcpu_id, PSCB(current, iip),
+                 d->domain_id, mpaddr, d->arch.convmem_end);
     }
 
     if (entry != NULL)
@@ -834,15 +888,43 @@ __assign_domain_page(struct domain *d,
     // dom0 tries to map real machine's I/O region, but failed.
     // It is very likely that dom0 doesn't boot correctly because
     // it can't access I/O. So complain here.
-    if ((flags & ASSIGN_nocache) &&
-        (pte_pfn(ret_pte) != (physaddr >> PAGE_SHIFT) ||
-         !(pte_val(ret_pte) & _PAGE_MA_UC)))
-        printk("%s:%d WARNING can't assign page domain 0x%p id %d\n"
-               "\talready assigned pte_val 0x%016lx\n"
-               "\tmpaddr 0x%016lx physaddr 0x%016lx flags 0x%lx\n",
-               __func__, __LINE__,
-               d, d->domain_id, pte_val(ret_pte),
-               mpaddr, physaddr, flags);
+    if (flags & ASSIGN_nocache) {
+        int warn = 0;
+
+        if (pte_pfn(ret_pte) != (physaddr >> PAGE_SHIFT))
+            warn = 1;
+        else if (!(pte_val(ret_pte) & _PAGE_MA_UC)) {
+            u32 type;
+            u64 attr;
+
+            warn = 1;
+
+            /*
+             * See
+             * complete_dom0_memmap()
+             * case EFI_RUNTIME_SERVICES_CODE:
+             * case EFI_RUNTIME_SERVICES_DATA:
+             * case EFI_ACPI_RECLAIM_MEMORY:
+             * case EFI_ACPI_MEMORY_NVS:
+             * case EFI_RESERVED_TYPE:
+             * 
+             * Currently only EFI_RUNTIME_SERVICES_CODE is found,
+             * so we suppress the warning only for that case.
+             */
+            type = efi_mem_type(physaddr);
+            attr = efi_mem_attributes(physaddr);
+            if (type == EFI_RUNTIME_SERVICES_CODE &&
+                (attr & EFI_MEMORY_UC) && (attr & EFI_MEMORY_WB))
+                warn = 0;
+        }
+        if (warn)
+            printk("%s:%d WARNING can't assign page domain 0x%p id %d\n"
+                   "\talready assigned pte_val 0x%016lx\n"
+                   "\tmpaddr 0x%016lx physaddr 0x%016lx flags 0x%lx\n",
+                   __func__, __LINE__,
+                   d, d->domain_id, pte_val(ret_pte),
+                   mpaddr, physaddr, flags);
+    }
 
     return -EAGAIN;
 }
@@ -1020,6 +1102,46 @@ assign_domain_same_page(struct domain *d
     }
 }
 
+static int
+efi_ucwb(unsigned long physaddr, unsigned long size)
+{
+    void *efi_map_start, *efi_map_end;
+    u64 efi_desc_size;
+    void* p;
+
+    efi_map_start = __va(ia64_boot_param->efi_memmap);
+    efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
+    efi_desc_size = ia64_boot_param->efi_memdesc_size;
+
+    for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
+        efi_memory_desc_t* md = (efi_memory_desc_t *)p;
+        unsigned long start = md->phys_addr;
+        unsigned long end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
+
+        if (start <= physaddr && physaddr < end) {
+            if ((physaddr + size) > end) {
+                gdprintk(XENLOG_INFO, "%s: physaddr 0x%lx size = 0x%lx\n",
+                        __func__, physaddr, size);
+                return 0;
+            }
+
+            // for UC|WB space
+            if ((md->attribute & EFI_MEMORY_WB) &&
+                (md->attribute & EFI_MEMORY_UC))
+                return 1;
+
+            return 0;
+        }
+
+        if (physaddr < start) {
+            break;
+        }
+    }
+
+    return 0;
+}
+
+
 int
 efi_mmio(unsigned long physaddr, unsigned long size)
 {
@@ -1760,28 +1882,40 @@ steal_page(struct domain *d, struct page
     return 0;
 }
 
-void
+int
 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
-                       unsigned long mfn)
+                       unsigned long mfn, int order)
 {
-    BUG_ON(!mfn_valid(mfn));
-    BUG_ON(mfn_to_page(mfn)->count_info != (PGC_allocated | 1));
-    set_gpfn_from_mfn(mfn, gpfn);
-    smp_mb();
-    assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn,
-                               ASSIGN_writable | ASSIGN_pgc_allocated);
+    unsigned long i;
+    for ( i = 0; i < ( 1UL << order); i++)
+    {
+        BUG_ON(!mfn_valid(mfn));
+        BUG_ON(mfn_to_page(mfn)->count_info != (PGC_allocated | 1));
+        set_gpfn_from_mfn(mfn, gpfn);
+        smp_mb();
+        assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn,
+          ASSIGN_writable | ASSIGN_pgc_allocated);
 
-    //BUG_ON(mfn != ((lookup_domain_mpa(d, gpfn << PAGE_SHIFT) & _PFN_MASK) >> PAGE_SHIFT));
+        mfn++;
+        gpfn++;
+    }
 
     perfc_incr(guest_physmap_add_page);
+
+    return 0;
 }
 
 void
 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
-                          unsigned long mfn)
+                          unsigned long mfn, int order)
 {
+    unsigned long i;
+
     BUG_ON(mfn == 0);//XXX
-    zap_domain_page_one(d, gpfn << PAGE_SHIFT, 0, mfn);
+
+    for ( i = 0; i < (1UL << order); i++ )
+        zap_domain_page_one(d, (gpfn+i) << PAGE_SHIFT, 0, mfn+i);
+
     perfc_incr(guest_physmap_remove_page);
 }
 
@@ -2183,7 +2317,7 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(
         if (prev_mfn && mfn_valid(prev_mfn)) {
             if (is_xen_heap_frame(mfn_to_page(prev_mfn)))
                 /* Xen heap frames are simply unhooked from this phys slot. */
-                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
+                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
             else
                 /* Normal domain memory is freed, to avoid leaking memory. */
                 guest_remove_page(d, xatp.gpfn);
@@ -2192,10 +2326,10 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(
         /* Unmap from old location, if any. */
         gpfn = get_gpfn_from_mfn(mfn);
         if (gpfn != INVALID_M2P_ENTRY)
-            guest_physmap_remove_page(d, gpfn, mfn);
+            guest_physmap_remove_page(d, gpfn, mfn, 0);
 
         /* Map at new location. */
-        guest_physmap_add_page(d, xatp.gpfn, mfn);
+        guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
 
     out:
         UNLOCK_BIGLOCK(d);
diff -Naurp xen/arch/ia64/xen/oprofile/perfmon.c xen-redhat/arch/ia64/xen/oprofile/perfmon.c
--- xen/arch/ia64/xen/oprofile/perfmon.c
+++ xen-redhat/arch/ia64/xen/oprofile/perfmon.c
@@ -85,6 +85,7 @@ static char * get_cpu_type(void)
         case 0x07:
             return "ia64/itanium";
         case 0x1f:
+        case 0x20:
             return "ia64/itanium2";
         default:
             return "ia64/ia64";
diff -Naurp xen/arch/ia64/xen/pci.c xen-redhat/arch/ia64/xen/pci.c
--- xen/arch/ia64/xen/pci.c
+++ xen-redhat/arch/ia64/xen/pci.c
@@ -0,0 +1,134 @@
+/*
+ * pci.c - Low-Level PCI Access in IA-64
+ *
+ * Derived from bios32.c of i386 tree.
+ *
+ * (c) Copyright 2002, 2005 Hewlett-Packard Development Company, L.P.
+ *  David Mosberger-Tang <davidm@hpl.hp.com>
+ * Bjorn Helgaas <bjorn.helgaas@hp.com>
+ * Copyright (C) 2004 Silicon Graphics, Inc.
+ *
+ * Note: Above list of copyright holders is incomplete...
+ */
+
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
+#include <xen/spinlock.h>
+
+#include <asm/io.h>
+#include <asm/sal.h>
+#include <asm/hw_irq.h>
+
+/*
+ * Low-level SAL-based PCI configuration access functions. Note that SAL
+ * calls are already serialized (via sal_lock), so we don't need another
+ * synchronization mechanism here.
+ */
+
+#define PCI_SAL_ADDRESS(seg, bus, devfn, reg)       \
+    (((u64) seg << 24) | (bus << 16) | (devfn << 8) | (reg))
+
+/* SAL 3.2 adds support for extended config space. */
+
+#define PCI_SAL_EXT_ADDRESS(seg, bus, devfn, reg)   \
+    (((u64) seg << 28) | (bus << 20) | (devfn << 12) | (reg))
+
+static int
+pci_sal_read (unsigned int seg, unsigned int bus, unsigned int devfn,
+        int reg, int len, u32 *value)
+{
+    u64 addr, data = 0;
+    int mode, result;
+
+    if (!value || (seg > 65535) || (bus > 255) || (devfn > 255) || (reg > 4095))
+        return -EINVAL;
+
+    if ((seg | reg) <= 255) {
+        addr = PCI_SAL_ADDRESS(seg, bus, devfn, reg);
+        mode = 0;
+    } else {
+        addr = PCI_SAL_EXT_ADDRESS(seg, bus, devfn, reg);
+        mode = 1;
+    }
+    result = ia64_sal_pci_config_read(addr, mode, len, &data);
+    if (result != 0)
+        return -EINVAL;
+
+    *value = (u32) data;
+    return 0;
+}
+
+static int
+pci_sal_write (unsigned int seg, unsigned int bus, unsigned int devfn,
+        int reg, int len, u32 value)
+{
+    u64 addr;
+    int mode, result;
+
+    if ((seg > 65535) || (bus > 255) || (devfn > 255) || (reg > 4095))
+        return -EINVAL;
+
+    if ((seg | reg) <= 255) {
+        addr = PCI_SAL_ADDRESS(seg, bus, devfn, reg);
+        mode = 0;
+    } else {
+        addr = PCI_SAL_EXT_ADDRESS(seg, bus, devfn, reg);
+        mode = 1;
+    }
+    result = ia64_sal_pci_config_write(addr, mode, len, value);
+    if (result != 0)
+        return -EINVAL;
+    return 0;
+}
+
+
+uint8_t pci_conf_read8(
+    unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg)
+{
+    uint32_t value;
+    BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255));
+    pci_sal_read(0, bus, (dev<<3)|func, reg, 1, &value);
+    return (uint8_t)value;
+}
+
+uint16_t pci_conf_read16(
+    unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg)
+{
+    uint32_t value;
+    BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255));
+    pci_sal_read(0, bus, (dev<<3)|func, reg, 2, &value);
+    return (uint16_t)value;
+}
+
+uint32_t pci_conf_read32(
+    unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg)
+{
+    uint32_t value;
+    BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255));
+    pci_sal_read(0, bus, (dev<<3)|func, reg, 4, &value);
+    return (uint32_t)value;
+}
+
+void pci_conf_write8(
+    unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg,
+    uint8_t data)
+{
+    BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255));
+    pci_sal_write(0, bus, (dev<<3)|func, reg, 1, data);
+}
+
+void pci_conf_write16(
+    unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg,
+    uint16_t data)
+{
+    BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255));
+    pci_sal_write(0, bus, (dev<<3)|func, reg, 2, data);
+}
+
+void pci_conf_write32(
+    unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg,
+    uint32_t data)
+{
+    BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255));
+    pci_sal_write(0, bus, (dev<<3)|func, reg, 4, data);
+}
diff -Naurp xen/arch/ia64/xen/regionreg.c xen-redhat/arch/ia64/xen/regionreg.c
--- xen/arch/ia64/xen/regionreg.c
+++ xen-redhat/arch/ia64/xen/regionreg.c
@@ -270,8 +270,16 @@ int set_one_rr(unsigned long rr, unsigne
 	return 1;
 }
 
+void set_virtual_rr0(void)
+{
+	struct vcpu *v = current;
+
+	ia64_set_rr(0, v->arch.metaphysical_saved_rr0);
+	ia64_srlz_d();
+}
+
 // set rr0 to the passed rid (for metaphysical mode so don't use domain offset
-int set_metaphysical_rr0(void)
+void set_metaphysical_rr0(void)
 {
 	struct vcpu *v = current;
 //	ia64_rr rrv;
@@ -279,7 +287,6 @@ int set_metaphysical_rr0(void)
 //	rrv.ve = 1; 	FIXME: TURN ME BACK ON WHEN VHPT IS WORKING
 	ia64_set_rr(0,v->arch.metaphysical_rr0);
 	ia64_srlz_d();
-	return 1;
 }
 
 void init_all_rr(struct vcpu *v)
diff -Naurp xen/arch/ia64/xen/vcpu.c xen-redhat/arch/ia64/xen/vcpu.c
--- xen/arch/ia64/xen/vcpu.c
+++ xen-redhat/arch/ia64/xen/vcpu.c
@@ -173,6 +173,11 @@ void vcpu_init_regs(struct vcpu *v)
 		    (unsigned char *)v->domain->arch.shared_info_va +
 		    INT_ENABLE_OFFSET(v);
 		VCPU(v, itv) = (1 << 16);	/* timer vector masked */
+
+		/* SAL specification 3.2.4 */
+		VCPU(v, vpsr) = IA64_PSR_AC | IA64_PSR_IC | IA64_PSR_BN;
+		v->vcpu_info->evtchn_upcall_pending = 0;
+		v->vcpu_info->evtchn_upcall_mask = -1;
 	}
 
 	/* pta.size must not be 0.  The minimum is 15 (32k) */
@@ -234,7 +239,7 @@ IA64FAULT vcpu_get_ar(VCPU * vcpu, u64 r
  VCPU processor status register access routines
 **************************************************************************/
 
-void vcpu_set_metaphysical_mode(VCPU * vcpu, BOOLEAN newmode)
+static void vcpu_set_metaphysical_mode(VCPU * vcpu, BOOLEAN newmode)
 {
 	/* only do something if mode changes */
 	if (!!newmode ^ !!PSCB(vcpu, metaphysical_mode)) {
@@ -242,7 +247,7 @@ void vcpu_set_metaphysical_mode(VCPU * v
 		if (newmode)
 			set_metaphysical_rr0();
 		else if (PSCB(vcpu, rrs[0]) != -1)
-			set_one_rr(0, PSCB(vcpu, rrs[0]));
+			set_virtual_rr0();
 	}
 }
 
@@ -392,54 +397,35 @@ IA64FAULT vcpu_set_psr_l(VCPU * vcpu, u6
 
 	newpsr = *(struct ia64_psr *)&val;
 	ipsr = (struct ia64_psr *)&regs->cr_ipsr;
-	// just handle psr.up and psr.pp for now
-	//if (val & ~(IA64_PSR_PP | IA64_PSR_UP | IA64_PSR_SP))
-	//	return IA64_ILLOP_FAULT;
-	// however trying to set other bits can't be an error as it is in ssm
-	if (newpsr.dfh) {
-		ipsr->dfh = 1;
-		PSCB(vcpu, vpsr_dfh) = 1;
-	} else {
-		ipsr->dfh = PSCB(vcpu, hpsr_dfh);
-		PSCB(vcpu, vpsr_dfh) = 0;
-	}       
-	if (newpsr.dfl)
-		ipsr->dfl = 1;
-	if (newpsr.pp) {
-		ipsr->pp = 1;
-		PSCB(vcpu, vpsr_pp) = 1;
-	} else {
-		ipsr->pp = 1;
-		PSCB(vcpu, vpsr_pp) = 0;
-	}
-	if (newpsr.up)
-		ipsr->up = 1;
-	if (newpsr.sp)
-		ipsr->sp = 1;
-	if (newpsr.i) {
-		if (vcpu->vcpu_info->evtchn_upcall_mask)
-			enabling_interrupts = 1;
-		vcpu->vcpu_info->evtchn_upcall_mask = 0;
-	}
-	if (newpsr.ic)
-		PSCB(vcpu, interrupt_collection_enabled) = 1;
-	if (newpsr.mfl)
-		ipsr->mfl = 1;
-	if (newpsr.mfh)
-		ipsr->mfh = 1;
-	if (newpsr.ac)
-		ipsr->ac = 1;
-	if (newpsr.up)
-		ipsr->up = 1;
-	if (newpsr.dt && newpsr.rt)
-		vcpu_set_metaphysical_mode(vcpu, FALSE);
-	else
-		vcpu_set_metaphysical_mode(vcpu, TRUE);
-	if (newpsr.be)
-		ipsr->be = 1;
+
+	ipsr->be = newpsr.be;
+	ipsr->up = newpsr.up;
+	ipsr->ac = newpsr.ac;
+	ipsr->mfl = newpsr.mfl;
+	ipsr->mfh = newpsr.mfh;
+
+	PSCB(vcpu, interrupt_collection_enabled) = newpsr.ic;
+
+	if (newpsr.i && vcpu->vcpu_info->evtchn_upcall_mask)
+		enabling_interrupts = 1;
+
+	vcpu->vcpu_info->evtchn_upcall_mask = !(newpsr.i);
+
+	vcpu_set_metaphysical_mode(vcpu, !(newpsr.dt && newpsr.rt));
+
+	ipsr->dfl = newpsr.dfl;
+	PSCB(vcpu, vpsr_dfh) = newpsr.dfh;
+	ipsr->dfh = newpsr.dfh ? 1 : PSCB(vcpu, hpsr_dfh);
+
+	ipsr->sp = newpsr.sp;
+
+	/* xenoprof: Don't change ipsr->pp; it is manipulated by xenoprof. */
+	PSCB(vcpu, vpsr_pp) = newpsr.pp;
+
 	if (enabling_interrupts &&
 	    vcpu_check_pending_interrupts(vcpu) != SPURIOUS_VECTOR)
 		PSCB(vcpu, pending_interruption) = 1;
+
 	return IA64_NO_FAULT;
 }
 
@@ -1330,21 +1316,21 @@ IA64FAULT vcpu_rfi(VCPU * vcpu)
 {
 	// TODO: Only allowed for current vcpu
 	PSR psr;
-	u64 int_enable, ifs;
+	u64 ifs, int_enable, psr_ic;
 	REGS *regs = vcpu_regs(vcpu);
 
 	psr.i64 = PSCB(vcpu, ipsr);
+	int_enable = psr.ia64_psr.i;
+	psr_ic = psr.ia64_psr.ic;
 	if (psr.ia64_psr.cpl < 3)
 		psr.ia64_psr.cpl = 2;
-	int_enable = psr.ia64_psr.i;
+
 	if (psr.ia64_psr.dfh) {
 		PSCB(vcpu, vpsr_dfh) = 1;
 	} else {
 		psr.ia64_psr.dfh = PSCB(vcpu, hpsr_dfh);
 		PSCB(vcpu, vpsr_dfh) = 0;
 	}
-	if (psr.ia64_psr.ic)
-		PSCB(vcpu, interrupt_collection_enabled) = 1;
 	if (psr.ia64_psr.dt && psr.ia64_psr.rt && psr.ia64_psr.it)
 		vcpu_set_metaphysical_mode(vcpu, FALSE);
 	else
@@ -1363,9 +1349,9 @@ IA64FAULT vcpu_rfi(VCPU * vcpu)
 
 	regs->cr_ipsr = psr.i64;
 	regs->cr_iip = PSCB(vcpu, iip);
-	PSCB(vcpu, interrupt_collection_enabled) = 1;
 	vcpu_bsw1(vcpu);
-	vcpu->vcpu_info->evtchn_upcall_mask = !int_enable;
+	PSCB(vcpu, interrupt_collection_enabled) = !!(psr_ic);
+	vcpu->vcpu_info->evtchn_upcall_mask = !(int_enable);
 	return IA64_NO_FAULT;
 }
 
@@ -1526,6 +1512,26 @@ vcpu_get_domain_bundle(VCPU * vcpu, REGS
 		// copy its value to the variable, tr, before use.
 		TR_ENTRY tr;
 
+		// fast path:
+		// try to access gip with guest virtual address directly.
+		// This may cause tlb miss. see vcpu_translate(). Be careful!
+		swap_rr0 = (!region && PSCB(vcpu, metaphysical_mode));
+		if (swap_rr0) {
+			set_virtual_rr0();
+		}
+		*bundle = __get_domain_bundle(gip);
+		if (swap_rr0) {
+			set_metaphysical_rr0();
+		}
+		
+		if (!bundle->i64[0] && !bundle->i64[1]) {
+			dprintk(XENLOG_INFO, "%s gip 0x%lx\n", __func__, gip);
+		} else {
+			// Okay, mDTC succeeded
+			return 1;
+		}
+		// mDTC failed, so try vTLB.
+
 		trp = vcpu_tr_lookup(vcpu, gip, rid, 0);
 		if (trp != NULL) {
 			tr = *trp;
@@ -1545,28 +1551,13 @@ vcpu_get_domain_bundle(VCPU * vcpu, REGS
 			tr = *trp;
 			goto found;
 		}
-#if 0
 		tr = PSCBX(vcpu, dtlb);
 		if (vcpu_match_tr_entry(&tr, gip, rid)) {
 			goto found;
 		}
-#endif
 
-		// try to access gip with guest virtual address
-		// This may cause tlb miss. see vcpu_translate(). Be careful!
-		swap_rr0 = (!region && PSCB(vcpu, metaphysical_mode));
-		if (swap_rr0) {
-			set_one_rr(0x0, PSCB(vcpu, rrs[0]));
-		}
-		*bundle = __get_domain_bundle(gip);
-		if (swap_rr0) {
-			set_metaphysical_rr0();
-		}
-		if (bundle->i64[0] == 0 && bundle->i64[1] == 0) {
-			dprintk(XENLOG_INFO, "%s gip 0x%lx\n", __func__, gip);
-			return 0;
-		}
-		return 1;
+		// mDTC and vTLB failed, so reflect the tlb miss into the guest.
+		return 0;
 
 	found:
 		gpip = ((tr.pte.ppn >> (tr.ps - 12)) << tr.ps) |
@@ -1744,33 +1735,65 @@ IA64FAULT vcpu_tak(VCPU * vcpu, u64 vadr
 
 IA64FAULT vcpu_set_dbr(VCPU * vcpu, u64 reg, u64 val)
 {
-	// TODO: unimplemented DBRs return a reserved register fault
-	// TODO: Should set Logical CPU state, not just physical
-	ia64_set_dbr(reg, val);
+	if (reg >= IA64_NUM_DBG_REGS)
+		return IA64_RSVDREG_FAULT;
+	if ((reg & 1) == 0) {
+		/* Validate address. */
+		if (val >= HYPERVISOR_VIRT_START && val <= HYPERVISOR_VIRT_END)
+			return IA64_ILLOP_FAULT;
+	} else {
+		if (!VMX_DOMAIN(vcpu)) {
+			/* Mask PL0. */
+			val &= ~(1UL << 56);
+		}
+	}
+	if (val != 0)
+		vcpu->arch.dbg_used |= (1 << reg);
+	else
+		vcpu->arch.dbg_used &= ~(1 << reg);
+	vcpu->arch.dbr[reg] = val;
+	if (vcpu == current)
+		ia64_set_dbr(reg, val);
 	return IA64_NO_FAULT;
 }
 
 IA64FAULT vcpu_set_ibr(VCPU * vcpu, u64 reg, u64 val)
 {
-	// TODO: unimplemented IBRs return a reserved register fault
-	// TODO: Should set Logical CPU state, not just physical
-	ia64_set_ibr(reg, val);
+	if (reg >= IA64_NUM_DBG_REGS)
+		return IA64_RSVDREG_FAULT;
+	if ((reg & 1) == 0) {
+		/* Validate address. */
+		if (val >= HYPERVISOR_VIRT_START && val <= HYPERVISOR_VIRT_END)
+			return IA64_ILLOP_FAULT;
+	} else {
+		if (!VMX_DOMAIN(vcpu)) {
+			/* Mask PL0. */
+			val &= ~(1UL << 56);
+		}
+	}
+	if (val != 0)
+		vcpu->arch.dbg_used |= (1 << (reg + IA64_NUM_DBG_REGS));
+	else
+		vcpu->arch.dbg_used &= ~(1 << (reg + IA64_NUM_DBG_REGS));
+	vcpu->arch.ibr[reg] = val;
+	if (vcpu == current)
+		ia64_set_ibr(reg, val);
 	return IA64_NO_FAULT;
 }
 
 IA64FAULT vcpu_get_dbr(VCPU * vcpu, u64 reg, u64 * pval)
 {
-	// TODO: unimplemented DBRs return a reserved register fault
-	u64 val = ia64_get_dbr(reg);
-	*pval = val;
+	if (reg >= IA64_NUM_DBG_REGS)
+		return IA64_RSVDREG_FAULT;
+	*pval = vcpu->arch.dbr[reg];
 	return IA64_NO_FAULT;
 }
 
 IA64FAULT vcpu_get_ibr(VCPU * vcpu, u64 reg, u64 * pval)
 {
-	// TODO: unimplemented IBRs return a reserved register fault
-	u64 val = ia64_get_ibr(reg);
-	*pval = val;
+	if (reg >= IA64_NUM_DBG_REGS)
+		return IA64_RSVDREG_FAULT;
+	*pval = vcpu->arch.ibr[reg];
 	return IA64_NO_FAULT;
 }
 
@@ -1973,8 +1996,8 @@ unsigned long vcpu_get_rr_ve(VCPU * vcpu
 IA64FAULT vcpu_set_rr(VCPU * vcpu, u64 reg, u64 val)
 {
 	PSCB(vcpu, rrs)[reg >> 61] = val;
-	// warning: set_one_rr() does it "live"
-	set_one_rr(reg, val);
+	if (vcpu == current)
+		set_one_rr(reg, val);
 	return IA64_NO_FAULT;
 }
 
@@ -2203,7 +2226,7 @@ IA64FAULT vcpu_itc_d(VCPU * vcpu, u64 pt
 	if (!pteval)
 		return IA64_ILLOP_FAULT;
 	if (swap_rr0)
-		set_one_rr(0x0, PSCB(vcpu, rrs[0]));
+		set_virtual_rr0();
 	vcpu_itc_no_srlz(vcpu, 2, ifa, pteval, pte, logps, &entry);
 	if (swap_rr0)
 		set_metaphysical_rr0();
@@ -2230,7 +2253,7 @@ IA64FAULT vcpu_itc_i(VCPU * vcpu, u64 pt
 	if (!pteval)
 		return IA64_ILLOP_FAULT;
 	if (swap_rr0)
-		set_one_rr(0x0, PSCB(vcpu, rrs[0]));
+		set_virtual_rr0();
 	vcpu_itc_no_srlz(vcpu, 1, ifa, pteval, pte, logps, &entry);
 	if (swap_rr0)
 		set_metaphysical_rr0();
diff -Naurp xen/arch/ia64/xen/vhpt.c xen-redhat/arch/ia64/xen/vhpt.c
--- xen/arch/ia64/xen/vhpt.c
+++ xen-redhat/arch/ia64/xen/vhpt.c
@@ -137,8 +137,8 @@ void vhpt_init(void)
 		panic("vhpt_init: bad VHPT alignment!\n");
 	__get_cpu_var(vhpt_paddr) = paddr;
 	__get_cpu_var(vhpt_pend) = paddr + (1 << VHPT_SIZE_LOG2) - 1;
-	printk("vhpt_init: vhpt paddr=0x%lx, end=0x%lx\n",
-		paddr, __get_cpu_var(vhpt_pend));
+	printk(XENLOG_DEBUG "vhpt_init: vhpt paddr=0x%lx, end=0x%lx\n",
+	       paddr, __get_cpu_var(vhpt_pend));
 	vhpt_erase(paddr);
 	// we don't enable VHPT here.
 	// context_switch() or schedule_tail() does it.
@@ -220,31 +220,20 @@ domain_purge_swtc_entries_vcpu_dirty_mas
 // (e.g. vcpu == current), smp_mb() is unnecessary.
 void vcpu_flush_vtlb_all(struct vcpu *v)
 {
-	if (VMX_DOMAIN(v)) {
-		/* This code may be call for remapping shared_info and
-		   grant_table share page from guest_physmap_remove_page()
-		   in arch_memory_op() XENMEM_add_to_physmap to realize
-		   PV-on-HVM feature. */
-		/* FIXME: This is not SMP-safe yet about p2m table */
-		/* Purge vTLB for VT-i domain */
-		thash_purge_all(v);
-	}
-	else {
-		/* First VCPU tlb.  */
-		vcpu_purge_tr_entry(&PSCBX(v,dtlb));
-		vcpu_purge_tr_entry(&PSCBX(v,itlb));
-		smp_mb();
+	/* First VCPU tlb.  */
+	vcpu_purge_tr_entry(&PSCBX(v,dtlb));
+	vcpu_purge_tr_entry(&PSCBX(v,itlb));
+	smp_mb();
 
-		/* Then VHPT.  */
-		if (HAS_PERVCPU_VHPT(v->domain))
-			vcpu_vhpt_flush(v);
-		else
-			local_vhpt_flush();
-		smp_mb();
+	/* Then VHPT.  */
+	if (HAS_PERVCPU_VHPT(v->domain))
+		vcpu_vhpt_flush(v);
+	else
+		local_vhpt_flush();
+	smp_mb();
 
-		/* Then mTLB.  */
-		local_flush_tlb_all();
-	}
+	/* Then mTLB.  */
+	local_flush_tlb_all();
 
 	/* We could clear bit in d->domain_dirty_cpumask only if domain d in
 	   not running on this processor.  There is currently no easy way to
@@ -268,6 +257,15 @@ void domain_flush_vtlb_all(struct domain
 		if (!v->is_initialised)
 			continue;
 
+		if (VMX_DOMAIN(v)) {
+			// This code may be called for remapping shared_info
+			// and grant_table from guest_physmap_remove_page()
+			// in arch_memory_op() XENMEM_add_to_physmap to realize
+			// PV-on-HVM feature.
+			vmx_vcpu_flush_vtlb_all(v);
+			continue;
+		}
+
 		if (v->processor == cpu)
 			vcpu_flush_vtlb_all(v);
 		else
diff -Naurp xen/arch/ia64/xen/xensetup.c xen-redhat/arch/ia64/xen/xensetup.c
--- xen/arch/ia64/xen/xensetup.c
+++ xen-redhat/arch/ia64/xen/xensetup.c
@@ -19,6 +19,7 @@
 #include <xen/serial.h>
 #include <xen/trace.h>
 #include <xen/keyhandler.h>
+#include <xen/vga.h>
 #include <asm/meminit.h>
 #include <asm/page.h>
 #include <asm/setup.h>
@@ -46,7 +47,6 @@ extern long is_platform_hp_ski(void);
 extern void early_setup_arch(char **);
 extern void late_setup_arch(char **);
 extern void hpsim_serial_init(void);
-extern void alloc_dom0(void);
 extern void setup_per_cpu_areas(void);
 extern void mem_init(void);
 extern void init_IRQ(void);
@@ -81,8 +81,10 @@ boolean_param("xencons_poll", opt_xencon
  * elilo chooses 256M as alignment when relocating, alignment issue
  * on IPF can be addressed.
  */
-unsigned int opt_xenheap_megabytes = XENHEAP_DEFAULT_MB;
-unsigned long xenheap_size = XENHEAP_DEFAULT_SIZE;
+static unsigned int opt_xenheap_megabytes = XENHEAP_DEFAULT_MB;
+integer_param("xenheap_megabytes", opt_xenheap_megabytes);
+
+unsigned long xenheap_size;
 extern long running_on_sim;
 unsigned long xen_pstart;
 void *xen_heap_start __read_mostly;
@@ -274,6 +276,20 @@ void start_kernel(void)
     }
     serial_init_preirq();
 
+#ifdef CONFIG_VGA
+    /* Plug in a default VGA mode */
+    vga_console_info.video_type = XEN_VGATYPE_TEXT_MODE_3;
+    vga_console_info.u.text_mode_3.font_height = 16; /* generic VGA? */
+    vga_console_info.u.text_mode_3.cursor_x =
+                                        ia64_boot_param->console_info.orig_x;
+    vga_console_info.u.text_mode_3.cursor_y =
+                                        ia64_boot_param->console_info.orig_y;
+    vga_console_info.u.text_mode_3.rows =
+                                        ia64_boot_param->console_info.num_rows;
+    vga_console_info.u.text_mode_3.columns =
+                                        ia64_boot_param->console_info.num_cols;
+#endif
+
     init_console();
     set_printk_prefix("(XEN) ");
 
@@ -290,6 +306,7 @@ void start_kernel(void)
 
     printk("Xen command line: %s\n", saved_command_line);
     /* xenheap should be in same TR-covered range with xen image */
+    xenheap_size = opt_xenheap_megabytes << 20;
     xenheap_phys_end = xen_pstart + xenheap_size;
     printk("xen image pstart: 0x%lx, xenheap pend: 0x%lx\n",
            xen_pstart, xenheap_phys_end);
@@ -409,8 +426,6 @@ void start_kernel(void)
 
     trap_init();
 
-    alloc_dom0();
-
     init_xenheap_pages(__pa(xen_heap_start), xenheap_phys_end);
     printk("Xen heap: %luMB (%lukB)\n",
 	(xenheap_phys_end-__pa(xen_heap_start)) >> 20,
diff -Naurp xen/arch/powerpc/domain.c xen-redhat/arch/powerpc/domain.c
--- xen/arch/powerpc/domain.c
+++ xen-redhat/arch/powerpc/domain.c
@@ -313,13 +313,13 @@ static void relinquish_memory(struct dom
     spin_unlock_recursive(&d->page_alloc_lock);
 }
 
-void domain_relinquish_resources(struct domain *d)
+int domain_relinquish_resources(struct domain *d)
 {
     relinquish_memory(d, &d->xenpage_list);
     relinquish_memory(d, &d->page_list);
     xfree(d->arch.foreign_mfns);
     xfree(d->arch.p2m);
-    return;
+    return 0;
 }
 
 void arch_dump_domain_info(struct domain *d)
diff -Naurp xen/arch/powerpc/mm.c xen-redhat/arch/powerpc/mm.c
--- xen/arch/powerpc/mm.c
+++ xen-redhat/arch/powerpc/mm.c
@@ -338,7 +338,7 @@ uint allocate_extents(struct domain *d, 
         /* Build p2m mapping for newly allocated extent. */
         mfn = page_to_mfn(pg);
         for (i = 0; i < (1 << ext_order); i++)
-            guest_physmap_add_page(d, gpfn + i, mfn + i);
+            guest_physmap_add_page(d, gpfn + i, mfn + i, 0);
 
         /* Bump starting PFN by extent size pages. */
         gpfn += ext_nrpages;
@@ -383,7 +383,7 @@ int allocate_rma(struct domain *d, unsig
         clear_page((void *)page_to_maddr(&d->arch.rma_page[i]));
 
         /* Set up p2m mapping for RMA. */
-        guest_physmap_add_page(d, i, mfn+i);
+        guest_physmap_add_page(d, i, mfn+i, 0);
     }
 
     /* shared_info uses last page of RMA */
@@ -579,7 +579,7 @@ void guest_physmap_add_page(
 }
 
 void guest_physmap_remove_page(
-    struct domain *d, unsigned long gpfn, unsigned long mfn)
+    struct domain *d, unsigned long gpfn, unsigned long mfn, int order)
 {
     if (page_get_owner(mfn_to_page(mfn)) != d) {
         printk("Won't unmap foreign MFN 0x%lx for DOM%d\n", mfn, d->domain_id);
diff -Naurp xen/arch/powerpc/sysctl.c xen-redhat/arch/powerpc/sysctl.c
--- xen/arch/powerpc/sysctl.c
+++ xen-redhat/arch/powerpc/sysctl.c
@@ -45,10 +45,10 @@ long arch_do_sysctl(struct xen_sysctl *s
             cpus_weight(cpu_sibling_map[0]);
         pi->cores_per_socket =
             cpus_weight(cpu_core_map[0]) / pi->threads_per_core;
-        pi->sockets_per_node = 
-            num_online_cpus() / cpus_weight(cpu_core_map[0]);
+        pi->sockets_per_node = num_online_cpus() /
+            (num_online_nodes() * pi->cores_per_socket * pi->threads_per_core);
 
-        pi->nr_nodes         = 1;
+        pi->nr_nodes         = num_online_nodes();
         pi->total_pages      = total_pages;
         pi->free_pages       = avail_domheap_pages();
         pi->cpu_khz          = cpu_khz;
diff -Naurp xen/arch/x86/acpi/boot.c xen-redhat/arch/x86/acpi/boot.c
--- xen/arch/x86/acpi/boot.c
+++ xen-redhat/arch/x86/acpi/boot.c
@@ -36,6 +36,7 @@
 #include <asm/apic.h>
 #include <asm/io.h>
 #include <asm/mpspec.h>
+#include <asm/processor.h>
 #include <mach_apic.h>
 #include <mach_mpparse.h>
 
@@ -918,5 +919,21 @@ int __init acpi_boot_init(void)
 
 	acpi_table_parse(ACPI_HPET, acpi_parse_hpet);
 
+	acpi_dmar_init();
+
 	return 0;
 }
+
+unsigned int acpi_get_processor_id(unsigned int cpu)
+{
+	unsigned int acpiid, apicid;
+
+	if ((apicid = x86_cpu_to_apicid[cpu]) == 0xff)
+		return 0xff;
+
+	for (acpiid = 0; acpiid < ARRAY_SIZE(x86_acpiid_to_apicid); acpiid++)
+		if (x86_acpiid_to_apicid[acpiid] == apicid)
+			return acpiid;
+
+	return 0xff;
+}
diff -Naurp xen/arch/x86/apic.c xen-redhat/arch/x86/apic.c
--- xen/arch/x86/apic.c
+++ xen-redhat/arch/x86/apic.c
@@ -40,7 +40,7 @@
 /*
  * Knob to control our willingness to enable the local APIC.
  */
-int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
+static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
 
 /*
  * Debug level
@@ -704,7 +704,7 @@ static void apic_pm_activate(void)
 static void __init lapic_disable(char *str)
 {
     enable_local_apic = -1;
-    clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+    setup_clear_cpu_cap(X86_FEATURE_APIC);
 }
 custom_param("nolapic", lapic_disable);
 
@@ -737,7 +737,7 @@ static int __init detect_init_APIC (void
     switch (boot_cpu_data.x86_vendor) {
     case X86_VENDOR_AMD:
         if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) ||
-            (boot_cpu_data.x86 == 15))        
+            (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x17))
             break;
         goto no_apic;
     case X86_VENDOR_INTEL:
diff -Naurp xen/arch/x86/boot/head.S xen-redhat/arch/x86/boot/head.S
--- xen/arch/x86/boot/head.S
+++ xen-redhat/arch/x86/boot/head.S
@@ -98,6 +98,7 @@ __start:
         mov     $0x80000001,%eax
         cpuid
 1:      mov     %edx,sym_phys(cpuid_ext_features)
+        mov     %edx,sym_phys(boot_cpu_data)+CPUINFO_ext_features
 
 #if defined(__x86_64__)
         /* Check for availability of long mode. */
diff -Naurp xen/arch/x86/boot/x86_32.S xen-redhat/arch/x86/boot/x86_32.S
--- xen/arch/x86/boot/x86_32.S
+++ xen-redhat/arch/x86/boot/x86_32.S
@@ -78,7 +78,7 @@ idt_descr:
         .word   0
 gdt_descr:
         .word   LAST_RESERVED_GDT_BYTE
-        .long   gdt_table - FIRST_RESERVED_GDT_BYTE
+        .long   boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
 
         .align PAGE_SIZE, 0
 /* NB. Rings != 0 get access up to MACH2PHYS_VIRT_END. This allows access to */
@@ -86,16 +86,17 @@ gdt_descr:
 #define GUEST_DESC(d)                                                   \
         .long ((MACH2PHYS_VIRT_END - 1) >> 12) & 0xffff,                \
               ((MACH2PHYS_VIRT_END - 1) >> 12) & (0xf << 16) | (d)
-ENTRY(gdt_table)
-        .quad 0x0000000000000000     /* unused */
+ENTRY(boot_cpu_gdt_table)
+        .quad 0x0000000000000000     /* double fault TSS */
         .quad 0x00cf9a000000ffff     /* 0xe008 ring 0 4.00GB code at 0x0 */
         .quad 0x00cf92000000ffff     /* 0xe010 ring 0 4.00GB data at 0x0 */
         GUEST_DESC(0x00c0ba00)       /* 0xe019 ring 1 3.xxGB code at 0x0 */
         GUEST_DESC(0x00c0b200)       /* 0xe021 ring 1 3.xxGB data at 0x0 */
         GUEST_DESC(0x00c0fa00)       /* 0xe02b ring 3 3.xxGB code at 0x0 */
         GUEST_DESC(0x00c0f200)       /* 0xe033 ring 3 3.xxGB data at 0x0 */
-        .quad 0x0000000000000000     /* unused                           */
-        .fill 2*NR_CPUS,8,0          /* space for TSS and LDT per CPU    */
+        .fill (PER_CPU_GDT_ENTRY - FLAT_RING3_DS / 8 - 1), 8, 0
+        .quad 0x0000910000000000     /* per-CPU entry (limit == cpu) */
+        .align PAGE_SIZE,0
 
 #ifdef CONFIG_X86_PAE
         .align 32
diff -Naurp xen/arch/x86/boot/x86_64.S xen-redhat/arch/x86/boot/x86_64.S
--- xen/arch/x86/boot/x86_64.S
+++ xen-redhat/arch/x86/boot/x86_64.S
@@ -85,7 +85,7 @@ multiboot_ptr:
         .word   0
 gdt_descr:
         .word   LAST_RESERVED_GDT_BYTE
-        .quad   gdt_table - FIRST_RESERVED_GDT_BYTE
+        .quad   boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
 
         .word   0,0,0
 idt_descr:
@@ -96,7 +96,7 @@ ENTRY(stack_start)
         .quad   cpu0_stack
 
         .align PAGE_SIZE, 0
-ENTRY(gdt_table)
+ENTRY(boot_cpu_gdt_table)
         .quad 0x0000000000000000     /* unused */
         .quad 0x00af9a000000ffff     /* 0xe008 ring 0 code, 64-bit mode   */
         .quad 0x00cf92000000ffff     /* 0xe010 ring 0 data                */
@@ -105,13 +105,13 @@ ENTRY(gdt_table)
         .quad 0x00cff2000000ffff     /* 0xe02b ring 3 data                */
         .quad 0x00affa000000ffff     /* 0xe033 ring 3 code, 64-bit mode   */
         .quad 0x00cf9a000000ffff     /* 0xe038 ring 0 code, compatibility */
-        .org gdt_table - FIRST_RESERVED_GDT_BYTE + __TSS(0) * 8
-        .fill 4*NR_CPUS,8,0          /* space for TSS and LDT per CPU     */
+        .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0
+        .quad 0x0000910000000000     /* per-CPU entry (limit == cpu)      */
 
         .align PAGE_SIZE, 0
 /* NB. Even rings != 0 get access to the full 4Gb, as only the            */
 /*     (compatibility) machine->physical mapping table lives there.       */
-ENTRY(compat_gdt_table)
+ENTRY(boot_cpu_compat_gdt_table)
         .quad 0x0000000000000000     /* unused */
         .quad 0x00af9a000000ffff     /* 0xe008 ring 0 code, 64-bit mode   */
         .quad 0x00cf92000000ffff     /* 0xe010 ring 0 data                */
@@ -120,5 +120,6 @@ ENTRY(compat_gdt_table)
         .quad 0x00cffa000000ffff     /* 0xe02b ring 3 code, compatibility */
         .quad 0x00cff2000000ffff     /* 0xe033 ring 3 data                */
         .quad 0x00cf9a000000ffff     /* 0xe038 ring 0 code, compatibility */
-        .org compat_gdt_table - FIRST_RESERVED_GDT_BYTE + __TSS(0) * 8
-        .fill 4*NR_CPUS,8,0          /* space for TSS and LDT per CPU     */
+        .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0
+        .quad 0x0000910000000000     /* per-CPU entry (limit == cpu)      */
+        .align PAGE_SIZE, 0
diff -Naurp xen/arch/x86/cpu/amd.c xen-redhat/arch/x86/cpu/amd.c
--- xen/arch/x86/cpu/amd.c
+++ xen-redhat/arch/x86/cpu/amd.c
@@ -3,6 +3,7 @@
 #include <xen/bitops.h>
 #include <xen/mm.h>
 #include <xen/smp.h>
+#include <xen/pci.h>
 #include <asm/io.h>
 #include <asm/msr.h>
 #include <asm/processor.h>
@@ -66,19 +67,6 @@ static int c1_ramping_may_cause_clock_dr
 	return 1;
 }
 
-/* PCI access functions. Should be safe to use 0xcf8/0xcfc port accesses here. */
-static u8 pci_read_byte(u32 bus, u32 dev, u32 fn, u32 reg)
-{
-	outl((1U<<31) | (bus << 16) | (dev << 11) | (fn << 8) | (reg & ~3), 0xcf8);
-	return inb(0xcfc + (reg & 3));
-}
-
-static void pci_write_byte(u32 bus, u32 dev, u32 fn, u32 reg, u8 val)
-{
-	outl((1U<<31) | (bus << 16) | (dev << 11) | (fn << 8) | (reg & ~3), 0xcf8);
-	outb(val, 0xcfc + (reg & 3));
-}
-
 /*
  * Disable C1-Clock ramping if enabled in PMM7.CpuLowPwrEnh on 8th-generation
  * cores only. Assume BIOS has setup all Northbridges equivalently.
@@ -86,18 +74,20 @@ static void pci_write_byte(u32 bus, u32 
 static void disable_c1_ramping(void) 
 {
 	u8 pmm7;
-	int node;
+	int node, nr_nodes;
 
-	for (node=0; node < NR_CPUS; node++) {
-		/* PMM7: bus=0, dev=0x18+node, function=0x3, register=0x87. */
-		pmm7 = pci_read_byte(0, 0x18+node, 0x3, 0x87);
-		/* Invalid read means we've updated every Northbridge. */
-		if (pmm7 == 0xFF)
-			break;
-		pmm7 &= 0xFC; /* clear pmm7[1:0] */
-		pci_write_byte(0, 0x18+node, 0x3, 0x87, pmm7);
-		printk ("AMD: Disabling C1 Clock Ramping Node #%x\n", node);
-	}
+	/* Read the number of nodes from the first Northbridge. */
+	nr_nodes = ((pci_conf_read32(0, 0x18, 0x0, 0x60)>>4)&0x07)+1;
+	for (node = 0; node < nr_nodes; node++) {
+		/* PMM7: bus=0, dev=0x18+node, function=0x3, register=0x87. */
+		pmm7 = pci_conf_read8(0, 0x18+node, 0x3, 0x87);
+		/* Invalid read means we've updated every Northbridge. */
+		if (pmm7 == 0xFF)
+			break;
+		pmm7 &= 0xFC; /* clear pmm7[1:0] */
+		pci_conf_write8(0, 0x18+node, 0x3, 0x87, pmm7);
+		printk ("AMD: Disabling C1 Clock Ramping Node #%x\n", node);
+	}
 }
 
 static void __init init_amd(struct cpuinfo_x86 *c)
@@ -278,7 +268,7 @@ static void __init init_amd(struct cpuin
 	}
 
 	switch (c->x86) {
-	case 15:
+	case 0xf ... 0x17:
 		set_bit(X86_FEATURE_K8, c->x86_capability);
 		break;
 	case 6:
@@ -303,11 +293,8 @@ static void __init init_amd(struct cpuin
 
 	display_cacheinfo(c);
 
-	if (cpuid_eax(0x80000000) >= 0x80000008) {
+	if (cpuid_eax(0x80000000) >= 0x80000008)
 		c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
-		if (c->x86_max_cores & (c->x86_max_cores - 1))
-			c->x86_max_cores = 1;
-	}
 
 	if (cpuid_eax(0x80000000) >= 0x80000007) {
 		c->x86_power = cpuid_edx(0x80000007);
@@ -317,15 +304,18 @@ static void __init init_amd(struct cpuin
 
 #ifdef CONFIG_X86_HT
 	/*
-	 * On a AMD dual core setup the lower bits of the APIC id
+	 * On an AMD multi-core setup the lower bits of the APIC id
 	 * distingush the cores.  Assumes number of cores is a power
 	 * of two.
 	 */
 	if (c->x86_max_cores > 1) {
 		int cpu = smp_processor_id();
-		unsigned bits = 0;
-		while ((1 << bits) < c->x86_max_cores)
-			bits++;
+		unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf;
+
+		if (bits == 0) {
+			while ((1 << bits) < c->x86_max_cores)
+				bits++;
+		}
 		cpu_core_id[cpu] = phys_proc_id[cpu] & ((1<<bits)-1);
 		phys_proc_id[cpu] >>= bits;
 		printk(KERN_INFO "CPU %d(%d) -> Core %d\n",
diff -Naurp xen/arch/x86/cpu/common.c xen-redhat/arch/x86/cpu/common.c
--- xen/arch/x86/cpu/common.c
+++ xen-redhat/arch/x86/cpu/common.c
@@ -23,6 +23,20 @@ static int disable_x86_serial_nr __devin
 
 struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
 
+/*
+ * Default host IA32_CR_PAT value to cover all memory types.
+ * BIOS usually sets it to 0x07040600070406.
+ */
+u64 host_pat = 0x050100070406;
+
+static unsigned int __cpuinitdata cleared_caps[NCAPINTS];
+
+void __init setup_clear_cpu_cap(unsigned int cap)
+{
+	__clear_bit(cap, boot_cpu_data.x86_capability);
+	__set_bit(cap, cleared_caps);
+}
+
 static void default_init(struct cpuinfo_x86 * c)
 {
 	/* Not much we can do here... */
@@ -220,6 +234,7 @@ static void __init early_cpu_detect(void
 		if (c->x86 >= 0x6)
 			c->x86_model += ((tfms >> 16) & 0xF) << 4;
 		c->x86_mask = tfms & 15;
+		cap0 &= ~cleared_caps[0];
 		if (cap0 & (1<<19))
 			c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
 		c->x86_capability[0] = cap0; /* Added for Xen bootstrap */
@@ -250,10 +265,10 @@ void __devinit generic_identify(struct c
 			c->x86_capability[4] = excap;
 			c->x86 = (tfms >> 8) & 15;
 			c->x86_model = (tfms >> 4) & 15;
-			if (c->x86 == 0xf) {
+			if (c->x86 == 0xf)
 				c->x86 += (tfms >> 20) & 0xff;
+			if (c->x86 >= 0x6)
 				c->x86_model += ((tfms >> 16) & 0xF) << 4;
-			} 
 			c->x86_mask = tfms & 15;
 		} else {
 			/* Have CPUID level 0 only - unheard of */
@@ -378,6 +393,9 @@ void __devinit identify_cpu(struct cpuin
 	if (disable_pse)
 		clear_bit(X86_FEATURE_PSE, c->x86_capability);
 
+	for (i = 0 ; i < NCAPINTS ; ++i)
+		c->x86_capability[i] &= ~cleared_caps[i];
+
 	/* If the model name is still unset, do table lookup. */
 	if ( !c->x86_model_id[0] ) {
 		char *p;
@@ -422,8 +440,6 @@ void __devinit identify_cpu(struct cpuin
 
 	if (c == &boot_cpu_data)
 		mtrr_bp_init();
-	else
-		mtrr_ap_init();
 }
 
 #ifdef CONFIG_X86_HT
@@ -549,7 +565,10 @@ void __devinit cpu_init(void)
 {
 	int cpu = smp_processor_id();
 	struct tss_struct *t = &init_tss[cpu];
-	char gdt_load[10];
+	struct desc_ptr gdt_desc = {
+		.base = (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY),
+		.limit = LAST_RESERVED_GDT_BYTE
+	};
 
 	if (cpu_test_and_set(cpu, cpu_initialized)) {
 		printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
@@ -557,12 +576,16 @@ void __devinit cpu_init(void)
 	}
 	printk(KERN_INFO "Initializing CPU#%d\n", cpu);
 
+	if (cpu_has_pat)
+		wrmsrl(MSR_IA32_CR_PAT, host_pat);
+
 	if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
 		clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
 
-	*(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
-	*(unsigned long  *)(&gdt_load[2]) = GDT_VIRT_START(current);
-	__asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
+	/* Install correct page table. */
+	write_ptbase(current);
+
+	__asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_desc) );
 
 	/* No nested task. */
 	__asm__("pushf ; andw $0xbfff,(%"__OP"sp) ; popf");
@@ -590,7 +613,4 @@ void __devinit cpu_init(void)
 #define CD(register) __asm__("mov %0,%%db" #register ::"r"(0UL) );
 	CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7);
 #undef CD
-
-	/* Install correct page table. */
-	write_ptbase(current);
 }
diff -Naurp xen/arch/x86/cpu/intel.c xen-redhat/arch/x86/cpu/intel.c
--- xen/arch/x86/cpu/intel.c
+++ xen-redhat/arch/x86/cpu/intel.c
@@ -118,6 +118,12 @@ static void __devinit init_intel(struct 
 
 	select_idle_routine(c);
 	l2 = init_intel_cacheinfo(c);
+	if (c->cpuid_level > 9) {
+		unsigned eax = cpuid_eax(10);
+		/* Check for version and the number of counters */
+		if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
+			set_bit(X86_FEATURE_ARCH_PERFMON, c->x86_capability);
+	}
 
 	/* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */
 	if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
diff -Naurp xen/arch/x86/cpu/mcheck/k7.c xen-redhat/arch/x86/cpu/mcheck/k7.c
--- xen/arch/x86/cpu/mcheck/k7.c
+++ xen-redhat/arch/x86/cpu/mcheck/k7.c
@@ -75,6 +75,9 @@ void amd_mcheck_init(struct cpuinfo_x86 
 	machine_check_vector = k7_machine_check;
 	wmb();
 
+	if (!cpu_has(c, X86_FEATURE_MCE))
+		return;
+
 	printk (KERN_INFO "Intel machine check architecture supported.\n");
 	rdmsr (MSR_IA32_MCG_CAP, l, h);
 	if (l & (1<<8))	/* Control register present ? */
diff -Naurp xen/arch/x86/cpu/mcheck/mce.c xen-redhat/arch/x86/cpu/mcheck/mce.c
--- xen/arch/x86/cpu/mcheck/mce.c
+++ xen-redhat/arch/x86/cpu/mcheck/mce.c
@@ -34,8 +34,7 @@ void mcheck_init(struct cpuinfo_x86 *c)
 
 	switch (c->x86_vendor) {
 		case X86_VENDOR_AMD:
-			if (c->x86==6 || c->x86==15)
-				amd_mcheck_init(c);
+			amd_mcheck_init(c);
 			break;
 
 		case X86_VENDOR_INTEL:
diff -Naurp xen/arch/x86/cpu/mtrr/generic.c xen-redhat/arch/x86/cpu/mtrr/generic.c
--- xen/arch/x86/cpu/mtrr/generic.c
+++ xen-redhat/arch/x86/cpu/mtrr/generic.c
@@ -202,7 +202,9 @@ static int set_mtrr_var_ranges(unsigned 
 	return changed;
 }
 
-static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi)
+static u32 deftype_lo, deftype_hi;
+
+static unsigned long set_mtrr_state(void)
 /*  [SUMMARY] Set the MTRR state for this CPU.
     <state> The MTRR state information to read.
     <ctxt> Some relevant CPU context.
@@ -233,7 +235,6 @@ static unsigned long set_mtrr_state(u32 
 
 
 static unsigned long cr4 = 0;
-static u32 deftype_lo, deftype_hi;
 static DEFINE_SPINLOCK(set_atomicity_lock);
 
 /*
@@ -300,7 +301,7 @@ static void generic_set_all(void)
 	prepare_set();
 
 	/* Actually set the state */
-	mask = set_mtrr_state(deftype_lo,deftype_hi);
+	mask = set_mtrr_state();
 
 	post_set();
 	local_irq_restore(flags);
diff -Naurp xen/arch/x86/cpu/mtrr/main.c xen-redhat/arch/x86/cpu/mtrr/main.c
--- xen/arch/x86/cpu/mtrr/main.c
+++ xen-redhat/arch/x86/cpu/mtrr/main.c
@@ -55,7 +55,7 @@ u32 num_var_ranges = 0;
 unsigned int *usage_table;
 static DECLARE_MUTEX(mtrr_sem);
 
-u32 size_or_mask, size_and_mask;
+u64 size_or_mask, size_and_mask;
 
 static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {};
 
@@ -134,6 +134,17 @@ struct set_mtrr_data {
 	mtrr_type	smp_type;
 };
 
+/* As per the IA32 SDM vol-3, section 10.11.8 (MTRR Considerations in MP Systems),
+ * MTRR updates must be synchronized across all the processors.
+ * This flag avoids the multi-cpu synchronization while booting each cpu.
+ * At boot and resume time, this flag is turned on in mtrr_aps_sync_begin().
+ * Using this flag, the mtrr initialization (and the all-cpus sync-up) in
+ * mtrr_ap_init() is avoided while booting each cpu.
+ * After all the cpus have come up, mtrr_aps_sync_end() synchronizes all
+ * the cpus and updates the mtrrs on all of them. Then this flag is turned off.
+ */
+int hold_mtrr_updates_on_aps;
+
 #ifdef CONFIG_SMP
 
 static void ipi_handler(void *info)
@@ -151,11 +162,13 @@ static void ipi_handler(void *info)
 		cpu_relax();
 
 	/*  The master has cleared me to execute  */
-	if (data->smp_reg != ~0U) 
+	if (data->smp_reg == ~0U) /* update all mtrr registers */
+		/* At cpu hot-add time this will reinitialize the mtrr
+		 * registers on the existing cpus. It is ok. */
+		mtrr_if->set_all();
+	else /* single mtrr register update */
 		mtrr_if->set(data->smp_reg, data->smp_base, 
 			     data->smp_size, data->smp_type);
-	else
-		mtrr_if->set_all();
 
 	atomic_dec(&data->count);
 	while(atomic_read(&data->gate))
@@ -240,7 +253,11 @@ static void set_mtrr(unsigned int reg, u
 	 * to replicate across all the APs. 
 	 * If we're doing that @reg is set to something special...
 	 */
-	if (reg != ~0U) 
+	if (reg == ~0U)  /* update all mtrr registers */
+		/* At boot or resume time, this will reinitialize the mtrrs on
+		 * the BP. It is ok. */
+		mtrr_if->set_all();
+	else /* update the single mtrr register */
 		mtrr_if->set(reg,base,size,type);
 
 	/* wait for the others */
@@ -589,8 +606,8 @@ void __init mtrr_bp_init(void)
 			     boot_cpu_data.x86_mask == 0x4))
 				phys_addr = 36;
 
-			size_or_mask = ~((1 << (phys_addr - PAGE_SHIFT)) - 1);
-			size_and_mask = ~size_or_mask & 0xfff00000;
+			size_or_mask = ~((1ULL << (phys_addr - PAGE_SHIFT)) - 1);
+			size_and_mask = ~size_or_mask & 0xfffff00000ULL;
 		} else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
 			   boot_cpu_data.x86 == 6) {
 			/* VIA C* family have Intel style MTRRs, but
@@ -639,9 +656,7 @@ void __init mtrr_bp_init(void)
 
 void mtrr_ap_init(void)
 {
-	unsigned long flags;
-
-	if (!mtrr_if || !use_intel())
+	if (!mtrr_if || !use_intel() || hold_mtrr_updates_on_aps)
 		return;
 	/*
 	 * Ideally we should hold mtrr_sem here to avoid mtrr entries changed,
@@ -651,11 +666,22 @@ void mtrr_ap_init(void)
 	 * 2.cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug lock to
 	 * prevent mtrr entry changes
 	 */
-	local_irq_save(flags);
+	set_mtrr(~0U, 0, 0, 0);
+}
 
-	mtrr_if->set_all();
+void mtrr_aps_sync_begin(void)
+{
+	if (!use_intel())
+		return;
+	hold_mtrr_updates_on_aps = 1;
+}
 
-	local_irq_restore(flags);
+void mtrr_aps_sync_end(void)
+{
+	if (!use_intel())
+		return;
+	set_mtrr(~0U, 0, 0, 0);
+	hold_mtrr_updates_on_aps = 0;
 }
 
 static int __init mtrr_init_finialize(void)
diff -Naurp xen/arch/x86/cpu/mtrr/mtrr.h xen-redhat/arch/x86/cpu/mtrr/mtrr.h
--- xen/arch/x86/cpu/mtrr/mtrr.h
+++ xen-redhat/arch/x86/cpu/mtrr/mtrr.h
@@ -83,7 +83,7 @@ void get_mtrr_state(void);
 
 extern void set_mtrr_ops(struct mtrr_ops * ops);
 
-extern u32 size_or_mask, size_and_mask;
+extern u64 size_or_mask, size_and_mask;
 extern struct mtrr_ops * mtrr_if;
 
 #define is_cpu(vnd)	(mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
diff -Naurp xen/arch/x86/crash.c xen-redhat/arch/x86/crash.c
--- xen/arch/x86/crash.c
+++ xen-redhat/arch/x86/crash.c
@@ -102,6 +102,7 @@ void machine_crash_shutdown(void)
     hvm_disable();
 
     info = kexec_crash_save_info();
+    info->xen_phys_start = xen_phys_start;
     info->dom0_pfn_to_mfn_frame_list_list =
         arch_get_pfn_to_mfn_frame_list_list(dom0);
 }
diff -Naurp xen/arch/x86/debug.c xen-redhat/arch/x86/debug.c
--- xen/arch/x86/debug.c
+++ xen-redhat/arch/x86/debug.c
@@ -0,0 +1,257 @@
+/*
+ * Copyright (C) 2009, Mukesh Rathor, Oracle Corp.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <xen/config.h>
+#include <xen/sched.h>
+#include <xen/compile.h>
+#include <xen/mm.h>
+#include <xen/domain_page.h>
+#include <xen/guest_access.h>
+#include <asm/p2m.h>
+
+/*
+ * This file is for general routines common to more than one debugger, like
+ * kdb, gdbsx, etc.
+ */
+
+#ifdef DBGDEBUG
+#define DBGP1(...) {(DBGDEBUG)     ? printk(__VA_ARGS__) : 0;}
+#define DBGP2(...) {(DBGDEBUG > 1) ? printk(__VA_ARGS__) : 0;}
+#else
+#define DBGP1(...) {0;}
+#define DBGP2(...) {0;}
+#endif
+
+typedef unsigned long dbgva_t;
+typedef unsigned char dbgbyte_t;
+
+/* Returns: mfn for the given (hvm guest) vaddr */
+static unsigned long
+dbg_hvm_va2mfn(dbgva_t vaddr, struct domain *dp, int toaddr)
+{
+    unsigned long mfn, gfn;
+
+    DBGP2("vaddr:%lx domid:%d\n", vaddr, dp->domain_id);
+
+    gfn = paging_gva_to_gfn(dp->vcpu[0], vaddr);
+    if ( gfn == INVALID_GFN )
+    {
+        DBGP2("kdb:bad gfn from gva_to_gfn\n");
+        return INVALID_MFN;
+    }
+
+    mfn = mfn_x(gfn_to_mfn(dp, gfn));
+
+    DBGP2("X: vaddr:%lx domid:%d mfn:%lx\n", vaddr, dp->domain_id, mfn);
+    return mfn;
+}
+
+#if defined(__x86_64__)
+
+/*
+ * pgd3val: this is the value of init_mm.pgd[3] in a PV guest. It is optional.
+ *          This to assist debug of modules in the guest. The kernel address
+ *          space seems is always mapped, but modules are not necessarily
+ *          mapped in any arbitraty guest cr3 that we pick if pgd3val is 0.
+ *          Modules should always be addressible if we use cr3 from init_mm.
+ *          Since pgd3val is already a pgd value, cr3->pgd[3], we just need to
+ *          do 2 level lookups.
+ *
+ * NOTE: 4 level paging works for 32 PAE guests also because cpu runs in IA32-e
+ *       mode.
+ * Returns: mfn for the given (pv guest) vaddr
+ */
+static unsigned long
+dbg_pv_va2mfn(dbgva_t vaddr, struct domain *dp, uint64_t pgd3val)
+{
+    l4_pgentry_t l4e, *l4t;
+    l3_pgentry_t l3e, *l3t;
+    l2_pgentry_t l2e, *l2t;
+    l1_pgentry_t l1e, *l1t;
+    unsigned long cr3 = (pgd3val ? pgd3val : dp->vcpu[0]->arch.cr3);
+    unsigned long mfn = cr3 >> PAGE_SHIFT;
+
+    DBGP2("vaddr:%lx domid:%d cr3:%lx pgd3:%lx\n", vaddr, dp->domain_id,
+          cr3, pgd3val);
+
+    if ( pgd3val == 0 )
+    {
+        l4t = mfn_to_virt(mfn);
+        l4e = l4t[l4_table_offset(vaddr)];
+        mfn = l4e_get_pfn(l4e);
+        DBGP2("l4t:%p l4to:%lx l4e:%"PRIpte" mfn:%lx\n", l4t,
+              l4_table_offset(vaddr), l4e_get_intpte(l4e), mfn);
+        if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
+        {
+            DBGP1("l4 PAGE not present. vaddr:%lx cr3:%lx\n", vaddr, cr3);
+            return INVALID_MFN;
+        }
+
+        l3t = mfn_to_virt(mfn);
+        l3e = l3t[l3_table_offset(vaddr)];
+        mfn = l3e_get_pfn(l3e);
+        DBGP2("l3t:%p l3to:%lx l3e:%"PRIpte" mfn:%lx\n", l3t,
+              l3_table_offset(vaddr), l3e_get_intpte(l3e), mfn);
+        if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+        {
+            DBGP1("l3 PAGE not present. vaddr:%lx cr3:%lx\n", vaddr, cr3);
+            return INVALID_MFN;
+        }
+    }
+
+    l2t = mfn_to_virt(mfn);
+    l2e = l2t[l2_table_offset(vaddr)];
+    mfn = l2e_get_pfn(l2e);
+    DBGP2("l2t:%p l2to:%lx l2e:%"PRIpte" mfn:%lx\n", l2t, l2_table_offset(vaddr),
+          l2e_get_intpte(l2e), mfn);
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
+         (l2e_get_flags(l2e) & _PAGE_PSE) )
+    {
+        DBGP1("l2 PAGE not present. vaddr:%lx cr3:%lx\n", vaddr, cr3);
+        return INVALID_MFN;
+    }
+    l1t = mfn_to_virt(mfn);
+    l1e = l1t[l1_table_offset(vaddr)];
+    mfn = l1e_get_pfn(l1e);
+    DBGP2("l1t:%p l1to:%lx l1e:%"PRIpte" mfn:%lx\n", l1t, l1_table_offset(vaddr),
+          l1e_get_intpte(l1e), mfn);
+
+    return mfn_valid(mfn) ? mfn : INVALID_MFN;
+}
+
+#else
+
+/* Returns: mfn for the given (pv guest) vaddr */
+static unsigned long
+dbg_pv_va2mfn(dbgva_t vaddr, struct domain *dp, uint64_t pgd3val)
+{
+#if CONFIG_PAGING_LEVELS >= 3
+    l3_pgentry_t l3e, *l3t;
+#endif
+    l2_pgentry_t l2e, *l2t;
+    l1_pgentry_t l1e, *l1t;
+    unsigned long cr3 = (pgd3val ? pgd3val : dp->vcpu[0]->arch.cr3);
+    unsigned long mfn = cr3 >> PAGE_SHIFT;
+
+    DBGP2("vaddr:%lx domid:%d cr3:%lx pgd3:%lx\n", vaddr, dp->domain_id,
+          cr3, pgd3val);
+
+#if CONFIG_PAGING_LEVELS >= 3
+    if ( pgd3val == 0 )
+    {
+        l3t  = map_domain_page(mfn);
+        l3t += (cr3 & 0xFE0UL) >> 3;
+        l3e = l3t[l3_table_offset(vaddr)];
+        mfn = l3e_get_pfn(l3e);
+        unmap_domain_page(l3t);
+        if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+            return INVALID_MFN;
+    }
+#endif
+
+    l2t = map_domain_page(mfn);
+    l2e = l2t[l2_table_offset(vaddr)];
+    mfn = l2e_get_pfn(l2e);
+    unmap_domain_page(l2t);
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
+         (l2e_get_flags(l2e) & _PAGE_PSE) )
+        return INVALID_MFN;
+
+    l1t = map_domain_page(mfn);
+    l1e = l1t[l1_table_offset(vaddr)];
+    mfn = l1e_get_pfn(l1e);
+    unmap_domain_page(l1t);
+
+    return mfn_valid(mfn) ? mfn : INVALID_MFN;
+}
+#endif  /* defined(__x86_64__) */
+
+/* Returns: number of bytes remaining to be copied */
+static int
+dbg_rw_guest_mem(dbgva_t addr, dbgbyte_t *buf, int len, struct domain *dp,
+                 int toaddr, uint64_t pgd3)
+{
+    while ( len > 0 )
+    {
+        char *va;
+        unsigned long mfn, pagecnt;
+
+        pagecnt = min_t(long, PAGE_SIZE - (addr & ~PAGE_MASK), len);
+
+        mfn = (dp->is_hvm
+               ? dbg_hvm_va2mfn(addr, dp, toaddr)
+               : dbg_pv_va2mfn(addr, dp, pgd3));
+        if ( mfn == INVALID_MFN )
+            break;
+
+        va = map_domain_page(mfn);
+        va = va + (addr & (PAGE_SIZE-1));
+
+        if ( toaddr )
+        {
+            memcpy(va, buf, pagecnt);    /* va = buf */
+            paging_mark_dirty(dp, mfn);
+        }
+        else
+        {
+            memcpy(buf, va, pagecnt);    /* buf = va */
+        }
+
+        unmap_domain_page(va);
+
+        addr += pagecnt;
+        buf += pagecnt;
+        len -= pagecnt;
+    }
+
+    return len;
+}
+
+/*
+ * addr is hypervisor addr if domid == IDLE_DOMAIN_ID, else it's guest addr
+ * buf is debugger buffer.
+ * if toaddr, then addr = buf (write to addr), else buf = addr (read from guest)
+ * pgd3: value of init_mm.pgd[3] in guest. see above.
+ * Returns: number of bytes remaining to be copied.
+ */
+int
+dbg_rw_mem(dbgva_t addr, dbgbyte_t *buf, int len, domid_t domid, int toaddr,
+           uint64_t pgd3)
+{
+    struct domain *dp = get_domain_by_id(domid);
+    int hyp = (domid == IDLE_DOMAIN_ID);
+
+    DBGP2("gmem:addr:%lx buf:%p len:$%d domid:%x toaddr:%x dp:%p\n",
+          addr, buf, len, domid, toaddr, dp);
+    if ( hyp )
+    {
+        if ( toaddr )
+            len = __copy_to_user((void *)addr, buf, len);
+        else
+            len = __copy_from_user(buf, (void *)addr, len);
+    }
+    else if ( dp )
+    {
+        if ( !dp->is_dying )   /* make sure guest is still there */
+            len = dbg_rw_guest_mem(addr, buf, len, dp, toaddr, pgd3);
+        put_domain(dp);
+    }
+
+    DBGP2("gmem:exit:len:$%d\n", len);
+    return len;
+}
diff -Naurp xen/arch/x86/dmi_scan.c xen-redhat/arch/x86/dmi_scan.c
--- xen/arch/x86/dmi_scan.c
+++ xen-redhat/arch/x86/dmi_scan.c
@@ -102,23 +102,32 @@ inline static int __init dmi_checksum(u8
 	return (sum==0);
 }
 
+int __init dmi_get_table(u32 *base, u32 *len)
+{
+	u8 buf[15];
+	char __iomem *p, *q;
+
+	p = maddr_to_virt(0xF0000);
+	for (q = p; q < p + 0x10000; q += 16) {
+		memcpy_fromio(buf, q, 15);
+		if (memcmp(buf, "_DMI_", 5)==0 && dmi_checksum(buf)) {
+			*base=buf[11]<<24|buf[10]<<16|buf[9]<<8|buf[8];
+			*len=buf[7]<<8|buf[6];
+			return 0;
+		}
+	}
+	return -1;
+}
+
 static int __init dmi_iterate(void (*decode)(struct dmi_header *))
 {
 	u8 buf[15];
 	char __iomem *p, *q;
 
-	/*
-	 * no iounmap() for that ioremap(); it would be a no-op, but it's
-	 * so early in setup that sucker gets confused into doing what
-	 * it shouldn't if we actually call it.
-	 */
-	p = ioremap(0xF0000, 0x10000);
-	if (p == NULL)
-		return -1;
+	p = maddr_to_virt(0xF0000);
 	for (q = p; q < p + 0x10000; q += 16) {
 		memcpy_fromio(buf, q, 15);
-		if(memcmp(buf, "_DMI_", 5)==0 && dmi_checksum(buf))
-		{
+		if (memcmp(buf, "_DMI_", 5)==0 && dmi_checksum(buf)) {
 			u16 num=buf[13]<<8|buf[12];
 			u16 len=buf[7]<<8|buf[6];
 			u32 base=buf[11]<<24|buf[10]<<16|buf[9]<<8|buf[8];
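
dmi_get_table() above recognises the 15-byte "_DMI_" anchor in the
0xF0000-0xFFFFF BIOS area and extracts the table base and length from it as
little-endian fields. A standalone sketch of that decoding follows; the byte
values are invented for illustration only.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* Invented 15-byte "_DMI_" anchor: offset 6-7 = table length,
     * offset 8-11 = 32-bit physical base, offset 12-13 = structure count. */
    uint8_t buf[15] = { '_', 'D', 'M', 'I', '_', 0x00,
                        0x4f, 0x0d,               /* len  = 0x0d4f     */
                        0x00, 0xd0, 0x0e, 0x00,   /* base = 0x000ed000 */
                        0x32, 0x00,               /* num  = 0x0032     */
                        0x28 };
    uint32_t base = buf[11] << 24 | buf[10] << 16 | buf[9] << 8 | buf[8];
    uint16_t len  = buf[7] << 8 | buf[6];
    uint16_t num  = buf[13] << 8 | buf[12];

    printf("DMI table at %#x, %u bytes, %u structures\n", base, len, num);
    return 0;
}
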
diff -Naurp xen/arch/x86/domain_build.c xen-redhat/arch/x86/domain_build.c
--- xen/arch/x86/domain_build.c
+++ xen-redhat/arch/x86/domain_build.c
@@ -35,7 +35,13 @@ extern void discard_initial_images(void)
 
 static long __initdata dom0_nrpages;
 static long __initdata dom0_min_nrpages;
-static long __initdata dom0_max_nrpages = LONG_MAX;
+/*
+ * Limit dom0 memory allocation to 32GB.  This should be large
+ * enough for anything, yet small enough that on the very largest
+ * NUMA systems we do not waste too much memory on dom0 bookkeeping
+ * and keep dom0 memory mostly on one node.
+ */
+static long __initdata dom0_max_nrpages = 32L << (30 - PAGE_SHIFT);
 
 /*
  * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>]
@@ -132,13 +138,16 @@ static unsigned long __init compute_dom0
     /*
      * If domain 0 allocation isn't specified, reserve 1/16th of available
      * memory for things like DMA buffers. This reservation is clamped to 
-     * a maximum of 128MB.
+     * a maximum of 384MB.
      */
     if ( dom0_nrpages == 0 )
     {
         dom0_nrpages = avail;
-        dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
+        dom0_nrpages = min(dom0_nrpages / 8, 384L << (20 - PAGE_SHIFT));
         dom0_nrpages = -dom0_nrpages;
+    } else {
+        /* User specified a dom0_mem value.  Do not clamp the maximum. */
+        dom0_max_nrpages = LONG_MAX;
     }
 
     /* Negative memory specification means "all memory - specified amount". */
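
The sizing above is plain shift arithmetic: the 32GB cap is expressed in
pages as 32L << (30 - PAGE_SHIFT), and the default reservation is 1/8th of
available memory clamped to 384MB. A standalone sketch, assuming 4KB pages
(PAGE_SHIFT of 12) and an 8GB host as an example:

#include <stdio.h>

#define PAGE_SHIFT 12   /* assumes 4KB pages */

int main(void)
{
    long cap_pages   = 32L << (30 - PAGE_SHIFT);    /* 32GB in pages        */
    long avail_pages = 8L << (30 - PAGE_SHIFT);     /* example: 8GB of RAM  */
    long clamp_pages = 384L << (20 - PAGE_SHIFT);   /* 384MB in pages       */
    long dom0_pages  = avail_pages / 8;

    if (dom0_pages > clamp_pages)
        dom0_pages = clamp_pages;

    printf("cap = %ld pages (%ld MB), default dom0 = %ld pages (%ld MB)\n",
           cap_pages, cap_pages >> (20 - PAGE_SHIFT),
           dom0_pages, dom0_pages >> (20 - PAGE_SHIFT));
    return 0;
}
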
@@ -326,24 +335,11 @@ int __init construct_dom0(
 #ifdef CONFIG_COMPAT
     if ( compat32 )
     {
-        l1_pgentry_t gdt_l1e;
-
         d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
         v->vcpu_info = (void *)&d->shared_info->compat.vcpu_info[0];
 
         if ( nr_pages != (unsigned int)nr_pages )
             nr_pages = UINT_MAX;
-
-        /*
-         * Map compatibility Xen segments into every VCPU's GDT. See
-         * arch_domain_create() for further comments.
-         */
-        gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table),
-                                PAGE_HYPERVISOR);
-        for ( i = 0; i < MAX_VIRT_CPUS; i++ )
-            d->arch.mm_perdomain_pt[((i << GDT_LDT_VCPU_SHIFT) +
-                                     FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
-        local_flush_tlb_one(GDT_LDT_VIRT_START + FIRST_RESERVED_GDT_BYTE);
     }
 #endif
     if ( parms.pae == PAEKERN_extended_cr3 )
@@ -363,9 +359,9 @@ int __init construct_dom0(
 #ifdef CONFIG_COMPAT
         HYPERVISOR_COMPAT_VIRT_START(d) =
             max_t(unsigned int, m2p_compat_vstart, value);
-        d->arch.physaddr_bitsize = !is_pv_32on64_domain(d) ? 64 :
-            fls((1UL << 32) - HYPERVISOR_COMPAT_VIRT_START(d)) - 1
-            + (PAGE_SIZE - 2);
+
+        domain_set_alloc_bitsize(d);
+
         if ( value > (!is_pv_32on64_domain(d) ?
                       HYPERVISOR_VIRT_START :
                       __HYPERVISOR_COMPAT_VIRT_START) )
@@ -772,16 +768,22 @@ int __init construct_dom0(
 
     if ( opt_dom0_max_vcpus == 0 )
         opt_dom0_max_vcpus = num_online_cpus();
-    if ( opt_dom0_max_vcpus > num_online_cpus() )
-        opt_dom0_max_vcpus = num_online_cpus();
     if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS )
         opt_dom0_max_vcpus = MAX_VIRT_CPUS;
-    if ( opt_dom0_max_vcpus > BITS_PER_GUEST_LONG(d) )
-        opt_dom0_max_vcpus = BITS_PER_GUEST_LONG(d);
     printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus);
 
+    /*
+     * If dom0 has a different number of VCPUs than there are physical CPUs
+     * on the system, we need to disable cpu frequency scaling.
+     */
+    if ( opt_dom0_max_vcpus != num_online_cpus() ) {
+        extern unsigned int opt_dom0_vcpus_pin;
+        cpufreq_controller = FREQCTL_none;
+        opt_dom0_vcpus_pin = 0;
+    }
+
     for ( i = 1; i < opt_dom0_max_vcpus; i++ )
-        (void)alloc_vcpu(d, i, i);
+        (void)alloc_vcpu(d, i, i % num_online_cpus());
 
     /* Set up CR3 value for write_ptbase */
     if ( paging_mode_enabled(v->domain) )
@@ -966,6 +968,8 @@ int __init construct_dom0(
     rc |= ioports_deny_access(dom0, 0x40, 0x43);
     /* PIT Channel 2 / PC Speaker Control. */
     rc |= ioports_deny_access(dom0, 0x61, 0x61);
+    /* PCI configuration space (NB. 0xcf8 has special treatment). */
+    rc |= ioports_deny_access(dom0, 0xcfc, 0xcff);
     /* Command-line I/O ranges. */
     process_dom0_ioports_disable();
 
diff -Naurp xen/arch/x86/domain.c xen-redhat/arch/x86/domain.c
--- xen/arch/x86/domain.c
+++ xen-redhat/arch/x86/domain.c
@@ -29,6 +29,7 @@
 #include <xen/console.h>
 #include <xen/percpu.h>
 #include <xen/compat.h>
+#include <xen/acpi.h>
 #include <asm/regs.h>
 #include <asm/mc146818rtc.h>
 #include <asm/system.h>
@@ -43,14 +44,14 @@
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
 #include <asm/msr.h>
+#include <xen/iommu.h>
 #ifdef CONFIG_COMPAT
 #include <compat/vcpu.h>
 #endif
 
 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
-DEFINE_PER_CPU(__u64, efer);
-
-static void unmap_vcpu_info(struct vcpu *v);
+DEFINE_PER_CPU(u64, efer);
+DEFINE_PER_CPU(unsigned long, cr4);
 
 static void paravirt_ctxt_switch_from(struct vcpu *v);
 static void paravirt_ctxt_switch_to(struct vcpu *v);
@@ -80,7 +81,6 @@ void idle_loop(void)
 {
     for ( ; ; )
     {
-        page_scrub_schedule_work();
         default_idle();
         do_softirq();
     }
@@ -266,6 +266,18 @@ static void release_compat_l4(struct vcp
     v->arch.guest_table_user = pagetable_null();
 }
 
+void domain_set_alloc_bitsize(struct domain *d)
+{
+    if ( !is_pv_32on64_domain(d) ||
+         (MACH2PHYS_COMPAT_NR_ENTRIES(d) >= max_page) )
+        return;
+    d->arch.physaddr_bitsize =
+        /* 2^n entries can be contained in guest's p2m mapping space */
+        fls(MACH2PHYS_COMPAT_NR_ENTRIES(d)) - 1
+        /* 2^n pages -> 2^(n+PAGE_SHIFT) bits */
+        + PAGE_SHIFT;
+}
+
 static inline int may_switch_mode(struct domain *d)
 {
     return (!is_hvm_domain(d) && (d->tot_pages == 0));
@@ -273,7 +285,6 @@ static inline int may_switch_mode(struct
 
 int switch_native(struct domain *d)
 {
-    l1_pgentry_t gdt_l1e;
     unsigned int vcpuid;
 
     if ( d == NULL )
@@ -286,24 +297,17 @@ int switch_native(struct domain *d)
     d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
     release_arg_xlat_area(d);
 
-    /* switch gdt */
-    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
     for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
     {
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
         if (d->vcpu[vcpuid])
             release_compat_l4(d->vcpu[vcpuid]);
     }
 
-    d->arch.physaddr_bitsize = 64;
-
     return 0;
 }
 
 int switch_compat(struct domain *d)
 {
-    l1_pgentry_t gdt_l1e;
     unsigned int vcpuid;
 
     if ( d == NULL )
@@ -315,33 +319,23 @@ int switch_compat(struct domain *d)
 
     d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
 
-    /* switch gdt */
-    gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), PAGE_HYPERVISOR);
     for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
     {
         if ( (d->vcpu[vcpuid] != NULL) &&
              (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
             goto undo_and_fail;
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
     }
 
-    d->arch.physaddr_bitsize =
-        fls((1UL << 32) - HYPERVISOR_COMPAT_VIRT_START(d)) - 1
-        + (PAGE_SIZE - 2);
+    domain_set_alloc_bitsize(d);
 
     return 0;
 
  undo_and_fail:
     d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
-    release_arg_xlat_area(d);
-    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
     while ( vcpuid-- != 0 )
     {
         if ( d->vcpu[vcpuid] != NULL )
             release_compat_l4(d->vcpu[vcpuid]);
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
     }
     return -ENOMEM;
 }
@@ -388,6 +382,9 @@ int vcpu_initialise(struct vcpu *v)
             v->arch.schedule_tail = continue_idle_domain;
             v->arch.cr3           = __pa(idle_pg_table);
         }
+
+        v->arch.guest_context.ctrlreg[4] =
+            real_cr4_to_pv_guest_cr4(mmu_cr4_features);
     }
 
     v->arch.perdomain_ptes =
@@ -401,8 +398,6 @@ void vcpu_destroy(struct vcpu *v)
     if ( is_pv_32on64_vcpu(v) )
         release_compat_l4(v);
 
-    unmap_vcpu_info(v);
-
     if ( is_hvm_vcpu(v) )
         hvm_vcpu_destroy(v);
 }
@@ -413,35 +408,28 @@ int arch_domain_create(struct domain *d)
     struct page_info *pg;
     int i;
 #endif
-    l1_pgentry_t gdt_l1e;
-    int vcpuid, pdpt_order;
+    int pdpt_order;
     int rc = -ENOMEM;
 
+    INIT_LIST_HEAD(&d->arch.pdev_list);
+
+    d->arch.relmem = RELMEM_not_started;
+    INIT_LIST_HEAD(&d->arch.relmem_list);
+
     pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
     d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
     if ( d->arch.mm_perdomain_pt == NULL )
         goto fail;
     memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
 
-    /*
-     * Map Xen segments into every VCPU's GDT, irrespective of whether every
-     * VCPU will actually be used. This avoids an NMI race during context
-     * switch: if we take an interrupt after switching CR3 but before switching
-     * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
-     * try to load CS from an invalid table.
-     */
-    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
-    for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
-
 #if defined(__i386__)
 
     mapcache_init(d);
 
 #else /* __x86_64__ */
 
-    if ( (pg = alloc_domheap_page(NULL)) == NULL )
+    pg = alloc_domheap_page(NULL);
+    if (pg == NULL)
         goto fail;
     d->arch.mm_perdomain_l2 = page_to_virt(pg);
     clear_page(d->arch.mm_perdomain_l2);
@@ -450,7 +438,8 @@ int arch_domain_create(struct domain *d)
             l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
                           __PAGE_HYPERVISOR);
 
-    if ( (pg = alloc_domheap_page(NULL)) == NULL )
+    pg = alloc_domheap_page(NULL);
+    if ( pg == NULL )
         goto fail;
     d->arch.mm_perdomain_l3 = page_to_virt(pg);
     clear_page(d->arch.mm_perdomain_l3);
@@ -470,6 +459,7 @@ int arch_domain_create(struct domain *d)
     {
         d->arch.ioport_caps = 
             rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
+        rc = -ENOMEM;
         if ( d->arch.ioport_caps == NULL )
             goto fail;
 
@@ -479,12 +469,18 @@ int arch_domain_create(struct domain *d)
         clear_page(d->shared_info);
         share_xen_page_with_guest(
             virt_to_page(d->shared_info), d, XENSHARE_writable);
+
+        if ( (rc = iommu_domain_init(d)) != 0 )
+            goto fail;
     }
 
     if ( is_hvm_domain(d) )
     {
         if ( (rc = hvm_domain_initialise(d)) != 0 )
+        {
+            iommu_domain_destroy(d);
             goto fail;
+        }
     }
     else
     {
@@ -492,7 +488,6 @@ int arch_domain_create(struct domain *d)
         d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
             (CONFIG_PAGING_LEVELS != 4);
     }
-        
 
     return 0;
 
@@ -513,6 +508,11 @@ void arch_domain_destroy(struct domain *
     if ( is_hvm_domain(d) )
         hvm_domain_destroy(d);
 
+    pci_release_devices(d);
+    free_domain_pirqs(d);
+    if ( !is_idle_domain(d) )
+        iommu_domain_destroy(d);
+
     paging_final_teardown(d);
 
     free_xenheap_pages(
@@ -530,13 +530,29 @@ void arch_domain_destroy(struct domain *
     free_xenheap_page(d->shared_info);
 }
 
+unsigned long pv_guest_cr4_fixup(unsigned long guest_cr4)
+{
+    unsigned long hv_cr4_mask, hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4());
+
+    hv_cr4_mask = ~X86_CR4_TSD;
+    if ( cpu_has_de )
+        hv_cr4_mask &= ~X86_CR4_DE;
+
+    if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) )
+        gdprintk(XENLOG_WARNING,
+                 "Attempt to change CR4 flags %08lx -> %08lx\n",
+                 hv_cr4 & ~(X86_CR4_PGE|X86_CR4_PSE), guest_cr4);
+
+    return (hv_cr4 & hv_cr4_mask) | (guest_cr4 & ~hv_cr4_mask);
+}
+
 /* This is called by arch_final_setup_guest and do_boot_vcpu */
 int arch_set_info_guest(
     struct vcpu *v, vcpu_guest_context_u c)
 {
     struct domain *d = v->domain;
     unsigned long cr3_pfn = INVALID_MFN;
-    unsigned long flags;
+    unsigned long flags, cr4;
     int i, rc = 0, compat;
 
     /* The context is a compat-mode one if the target domain is compat-mode;
@@ -625,10 +641,28 @@ int arch_set_info_guest(
 
         /* Ensure real hardware interrupts are enabled. */
         v->arch.guest_context.user_regs.eflags |= EF_IE;
+
+        cr4 = v->arch.guest_context.ctrlreg[4];
+        v->arch.guest_context.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(cr4) :
+            real_cr4_to_pv_guest_cr4(mmu_cr4_features);
+
     }
     else
     {
+        u32* ident_pt;
+
         hvm_load_cpu_guest_regs(v, &v->arch.guest_context.user_regs);
+        /* Fill it with 32-bit, non-PAE superpage entries, each mapping 4MB
+         * of virtual address space onto the same physical address range */
+        if ( v->vcpu_id == 0 )
+        {
+            ident_pt = map_domain_page(mfn_x(gfn_to_mfn(v->domain,
+                      (HVM_IDENT_PT_PAGE >> PAGE_SHIFT))));
+            for ( i = 0; i < PAGE_SIZE / sizeof(*ident_pt); i++ )
+                ident_pt[i] = (i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER
+                  | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE;
+            unmap_domain_page(ident_pt);
+        }
     }
 
     memset(v->arch.guest_context.debugreg, 0,
@@ -701,6 +735,11 @@ int arch_set_info_guest(
 
                 v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn);
             }
+            else if ( !(flags & VGCF_in_kernel) )
+            {
+                destroy_gdt(v);
+                return -EINVAL;
+            }
 #endif
         }
 #ifdef CONFIG_COMPAT
@@ -827,7 +866,7 @@ map_vcpu_info(struct vcpu *v, unsigned l
      * lost.  The domain will get a spurious event, but it can cope.
      */
     vcpu_info(v, evtchn_upcall_pending) = 1;
-    for ( i = 0; i < BITS_PER_GUEST_LONG(d); i++ )
+    for ( i = 0; i < BITS_PER_EVTCHN_WORD(d); i++ )
         set_bit(i, vcpu_info_addr(v, evtchn_pending_sel));
 
     /*
@@ -893,6 +932,25 @@ arch_do_vcpu_op(
         break;
     }
 
+    case VCPUOP_get_physid:
+    {
+        struct vcpu_get_physid cpu_id;
+
+        rc = -EINVAL;
+        if ( !v->domain->is_pinned )
+            break;
+
+        cpu_id.phys_id = (x86_cpu_to_apicid[v->vcpu_id] |
+                          (acpi_get_processor_id(v->vcpu_id) << 8));
+
+        rc = -EFAULT;
+        if ( copy_to_guest(arg, &cpu_id, 1) )
+            break;
+
+        rc = 0;
+        break;
+    }
+
     default:
         rc = -ENOSYS;
         break;
@@ -1169,9 +1227,15 @@ static void paravirt_ctxt_switch_from(st
 
 static void paravirt_ctxt_switch_to(struct vcpu *v)
 {
+    unsigned long cr4;
+
     set_int80_direct_trap(v);
     switch_kernel_stack(v);
 
+    cr4 = pv_guest_cr4_to_real_cr4(v->arch.guest_context.ctrlreg[4]);
+    if ( unlikely(cr4 != read_cr4()) )
+        write_cr4(cr4);
+
     if ( unlikely(v->arch.guest_context.debugreg[7]) )
     {
         write_debugreg(0, v->arch.guest_context.debugreg[0]);
@@ -1183,12 +1247,19 @@ static void paravirt_ctxt_switch_to(stru
     }
 }
 
+static inline int need_full_gdt(struct vcpu *v)
+{
+    return (!is_hvm_vcpu(v) && !is_idle_vcpu(v));
+}
+
 static void __context_switch(void)
 {
     struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
     unsigned int          cpu = smp_processor_id();
     struct vcpu          *p = per_cpu(curr_vcpu, cpu);
     struct vcpu          *n = current;
+    struct desc_struct   *gdt;
+    struct desc_ptr       gdt_desc;
 
     ASSERT(p != n);
     ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
@@ -1214,14 +1285,35 @@ static void __context_switch(void)
         cpu_set(cpu, n->domain->domain_dirty_cpumask);
     cpu_set(cpu, n->vcpu_dirty_cpumask);
 
+    gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) :
+                                  per_cpu(compat_gdt_table, cpu);
+    if ( need_full_gdt(n) )
+    {
+        struct page_info *page = virt_to_page(gdt);
+        unsigned int i;
+        for ( i = 0; i < NR_RESERVED_GDT_PAGES; i++ )
+            l1e_write(n->domain->arch.mm_perdomain_pt +
+                      (n->vcpu_id << GDT_LDT_VCPU_SHIFT) +
+                      FIRST_RESERVED_GDT_PAGE + i,
+                      l1e_from_page(page + i, __PAGE_HYPERVISOR));
+    }
+
+    if ( need_full_gdt(p) &&
+         ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(n)) )
+    {
+        gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
+        gdt_desc.base  = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
+        asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
+    }
+
     write_ptbase(n);
 
-    if ( p->vcpu_id != n->vcpu_id )
+    if ( need_full_gdt(n) &&
+         ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(p)) )
     {
-        char gdt_load[10];
-        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
-        *(unsigned long  *)(&gdt_load[2]) = GDT_VIRT_START(n);
-        __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
+        gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
+        gdt_desc.base  = GDT_VIRT_START(n);
+        asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
     }
 
     if ( p->domain != n->domain )
@@ -1251,7 +1343,7 @@ void context_switch(struct vcpu *prev, s
     local_irq_disable();
 
     if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
-        pt_freeze_time(prev);
+        pt_save_timer(prev);
 
     set_current(next);
 
@@ -1271,9 +1363,6 @@ void context_switch(struct vcpu *prev, s
         {
             uint64_t efer = read_efer();
 
-            local_flush_tlb_one(GDT_VIRT_START(next) +
-                                FIRST_RESERVED_GDT_BYTE);
-
             if ( !is_pv_32on64_vcpu(next) == !(efer & EFER_SCE) )
                 write_efer(efer ^ EFER_SCE);
         }
@@ -1346,6 +1435,65 @@ void sync_vcpu_execstate(struct vcpu *v)
     flush_tlb_mask(v->vcpu_dirty_cpumask);
 }
 
+struct migrate_info {
+    long (*func)(void *data);
+    void *data;
+    void (*saved_schedule_tail)(struct vcpu *);
+    cpumask_t saved_affinity;
+};
+
+static void continue_hypercall_on_cpu_helper(struct vcpu *v)
+{
+    struct cpu_user_regs *regs = guest_cpu_user_regs();
+    struct migrate_info *info = v->arch.continue_info;
+    cpumask_t mask = info->saved_affinity;
+
+    regs->eax = info->func(info->data);
+
+    v->arch.schedule_tail = info->saved_schedule_tail;
+    v->arch.continue_info = NULL;
+
+    xfree(info);
+
+    vcpu_unlock_affinity(v, &mask);
+    schedule_tail(v);
+}
+
+int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
+{
+    struct vcpu *v = current;
+    struct migrate_info *info;
+    int rc;
+
+    if ( cpu == smp_processor_id() )
+        return func(data);
+
+    info = xmalloc(struct migrate_info);
+    if ( info == NULL )
+        return -ENOMEM;
+
+    info->func = func;
+    info->data = data;
+    info->saved_schedule_tail = v->arch.schedule_tail;
+    info->saved_affinity = cpumask_of_cpu(cpu);
+
+    v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
+    v->arch.continue_info = info;
+
+    rc = vcpu_lock_affinity(v, &info->saved_affinity);
+    if ( rc )
+    {
+        v->arch.schedule_tail = info->saved_schedule_tail;
+        v->arch.continue_info = NULL;
+        xfree(info);
+        return rc;
+    }
+
+    /* Dummy return value will be overwritten by new schedule_tail. */
+    BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
+    return 0;
+}
+
 #define next_arg(fmt, args) ({                                              \
     unsigned long __arg;                                                    \
     switch ( *(fmt)++ )                                                     \
@@ -1529,12 +1677,13 @@ int hypercall_xlat_continuation(unsigned
 }
 #endif
 
-static void relinquish_memory(struct domain *d, struct list_head *list,
+static int relinquish_memory(struct domain *d, struct list_head *list,
                               unsigned long type)
 {
     struct list_head *ent;
     struct page_info  *page;
     unsigned long     x, y;
+    int               ret = 0;
 
     /* Use a recursive lock, as we may enter 'free_domheap_page'. */
     spin_lock_recursive(&d->page_alloc_lock);
@@ -1549,44 +1698,98 @@ static void relinquish_memory(struct dom
         {
             /* Couldn't get a reference -- someone is freeing this page. */
             ent = ent->next;
+            list_move_tail(&page->list, &d->arch.relmem_list);
             continue;
         }
 
         if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
-            put_page_and_type(page);
+            ret = put_page_and_type_preemptible(page, 1);
+        switch ( ret )
+        {
+        case 0:
+            break;
+        case -EAGAIN:
+        case -EINTR:
+            list_move(&page->list, list);
+            set_bit(_PGT_pinned, &page->u.inuse.type_info);
+            put_page(page);
+            goto out;
+        default:
+            BUG();
+        }
 
         if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
             put_page(page);
 
         /*
          * Forcibly invalidate top-most, still valid page tables at this point
-         * to break circular 'linear page table' references. This is okay
-         * because MMU structures are not shared across domains and this domain
-         * is now dead. Thus top-most valid tables are not in use so a non-zero
-         * count means circular reference.
+         * to break circular 'linear page table' references as well as clean up
+         * partially validated pages. This is okay because MMU structures are
+         * not shared across domains and this domain is now dead. Thus top-most
+         * valid tables are not in use so a non-zero count means circular
+         * reference or partially validated.
          */
         y = page->u.inuse.type_info;
         for ( ; ; )
         {
             x = y;
-            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
-                        (type|PGT_validated)) )
+            if ( likely((x & PGT_type_mask) != type) ||
+                 likely(!(x & (PGT_validated|PGT_partial))) )
                 break;
 
-            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
+            y = cmpxchg(&page->u.inuse.type_info, x,
+                        x & ~(PGT_validated|PGT_partial));
             if ( likely(y == x) )
             {
-                free_page_type(page, type);
+                /* No need for atomic update of type_info here: no one else updates it. */
+                switch ( ret = free_page_type(page, x, 1) )
+                {
+                case 0:
+                    break;
+                case -EINTR:
+                    list_move(&page->list, list);
+                    page->u.inuse.type_info |= PGT_validated;
+                    if ( x & PGT_partial )
+                        put_page(page);
+                    put_page(page);
+                    ret = -EAGAIN;
+                    goto out;
+                case -EAGAIN:
+                    list_move(&page->list, list);
+                    page->u.inuse.type_info |= PGT_partial;
+                    if ( x & PGT_partial )
+                        put_page(page);
+                    goto out;
+                default:
+                    BUG();
+                }
+                if ( x & PGT_partial )
+                {
+                    page->u.inuse.type_info--;
+                    put_page(page);
+                }
                 break;
             }
         }
 
-        /* Follow the list chain and /then/ potentially free the page. */
+        /* Put the page on the list and /then/ potentially free it. */
         ent = ent->next;
+        list_move_tail(&page->list, &d->arch.relmem_list);
         put_page(page);
+
+        if ( hypercall_preempt_check() )
+        {
+            ret = -EAGAIN;
+            goto out;
+        }
     }
 
+    /* list is empty at this point. */
+    list_splice_init(&d->arch.relmem_list, list);
+
+ out:
     spin_unlock_recursive(&d->page_alloc_lock);
+    return ret;
 }
 
 static void vcpu_destroy_pagetables(struct vcpu *v)
@@ -1624,10 +1827,6 @@ static void vcpu_destroy_pagetables(stru
             put_page(mfn_to_page(pfn));
         else
             put_page_and_type(mfn_to_page(pfn));
-#ifdef __x86_64__
-        if ( pfn == pagetable_get_pfn(v->arch.guest_table_user) )
-            v->arch.guest_table_user = pagetable_null();
-#endif
         v->arch.guest_table = pagetable_null();
     }
 
@@ -1636,10 +1835,13 @@ static void vcpu_destroy_pagetables(stru
     pfn = pagetable_get_pfn(v->arch.guest_table_user);
     if ( pfn != 0 )
     {
-        if ( paging_mode_refcounts(d) )
-            put_page(mfn_to_page(pfn));
-        else
-            put_page_and_type(mfn_to_page(pfn));
+        if ( !is_pv_32bit_vcpu(v) )
+        {
+            if ( paging_mode_refcounts(d) )
+                put_page(mfn_to_page(pfn));
+            else
+                put_page_and_type(mfn_to_page(pfn));
+        }
         v->arch.guest_table_user = pagetable_null();
     }
 #endif
@@ -1647,43 +1849,83 @@ static void vcpu_destroy_pagetables(stru
     v->arch.cr3 = 0;
 }
 
-void domain_relinquish_resources(struct domain *d)
+int domain_relinquish_resources(struct domain *d)
 {
+    int ret;
     struct vcpu *v;
 
     BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
 
-    /* Drop the in-use references to page-table bases. */
-    for_each_vcpu ( d, v )
-        vcpu_destroy_pagetables(v);
-
-    /* Tear down paging-assistance stuff. */
-    paging_teardown(d);
-
-    /*
-     * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
-     * it automatically gets squashed when the guest's mappings go away.
-     */
-    for_each_vcpu(d, v)
-        destroy_gdt(v);
-
-    /* Relinquish every page of memory. */
+    switch ( d->arch.relmem )
+    {
+    case RELMEM_not_started:
+        /* Tear down paging-assistance stuff. */
+        paging_teardown(d);
+
+        for_each_vcpu ( d, v )
+        {
+            /* Drop the in-use references to page-table bases. */
+            vcpu_destroy_pagetables(v);
+
+            /*
+             * Relinquish GDT mappings. No need for explicit unmapping of the
+             * LDT as it automatically gets squashed with the guest mappings.
+             */
+            destroy_gdt(v);
+
+            unmap_vcpu_info(v);
+        }
+
+        d->arch.relmem = RELMEM_xen;
+        /* fallthrough */
+
+        /* Relinquish every page of memory. */
+    case RELMEM_xen:
+        ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
+        if ( ret )
+            return ret;
 #if CONFIG_PAGING_LEVELS >= 4
-    relinquish_memory(d, &d->xenpage_list, PGT_l4_page_table);
-    relinquish_memory(d, &d->page_list, PGT_l4_page_table);
+        d->arch.relmem = RELMEM_l4;
+        /* fallthrough */
+
+    case RELMEM_l4:
+        ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table);
+        if ( ret )
+            return ret;
 #endif
 #if CONFIG_PAGING_LEVELS >= 3
-    relinquish_memory(d, &d->xenpage_list, PGT_l3_page_table);
-    relinquish_memory(d, &d->page_list, PGT_l3_page_table);
-#endif
-    relinquish_memory(d, &d->xenpage_list, PGT_l2_page_table);
-    relinquish_memory(d, &d->page_list, PGT_l2_page_table);
+        d->arch.relmem = RELMEM_l3;
+        /* fallthrough */
+
+    case RELMEM_l3:
+        ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table);
+        if ( ret )
+            return ret;
+#endif
+        d->arch.relmem = RELMEM_l2;
+        /* fallthrough */
+
+    case RELMEM_l2:
+        ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table);
+        if ( ret )
+            return ret;
+        d->arch.relmem = RELMEM_done;
+        /* fallthrough */
+
+    case RELMEM_done:
+        break;
+
+    default:
+        BUG();
+    }
 
     /* Free page used by xen oprofile buffer. */
     free_xenoprof_pages(d);
 
     if ( is_hvm_domain(d) )
         hvm_domain_relinquish_resources(d);
+
+    return 0;
 }
 
 void arch_dump_domain_info(struct domain *d)
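
The reworked domain_relinquish_resources() above is resumable: it records how
far teardown has progressed in d->arch.relmem and returns -EAGAIN when
preempted, so the caller keeps retrying until it gets 0. Below is a rough
standalone model of that pattern; the per-page-table-level phases are
collapsed into one, and the retry loop is an assumption about the caller, not
the actual Xen code path.

#include <stdio.h>
#include <errno.h>

enum relmem { RELMEM_not_started, RELMEM_in_progress, RELMEM_done };

struct toy_domain {
    enum relmem relmem;
    int pages_left;               /* stand-in for the page lists */
};

/* Free a bounded amount of memory, remember where we stopped, and return
 * -EAGAIN if there is more work to do. */
static int toy_relinquish(struct toy_domain *d)
{
    int budget = 100;             /* stand-in for hypercall_preempt_check() */

    if (d->relmem == RELMEM_not_started)
        d->relmem = RELMEM_in_progress;

    while (d->pages_left > 0) {
        d->pages_left--;
        if (--budget == 0)
            return -EAGAIN;       /* preempted: call again later */
    }
    d->relmem = RELMEM_done;
    return 0;
}

int main(void)
{
    struct toy_domain d = { RELMEM_not_started, 1000 };
    int calls = 0, rc;

    do {
        rc = toy_relinquish(&d);
        calls++;
    } while (rc == -EAGAIN);

    printf("teardown finished after %d calls\n", calls);
    return 0;
}
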
diff -Naurp xen/arch/x86/domctl.c xen-redhat/arch/x86/domctl.c
--- xen/arch/x86/domctl.c
+++ xen-redhat/arch/x86/domctl.c
@@ -24,6 +24,21 @@
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
 #include <asm/processor.h>
+#include <xen/iommu.h>
+
+typedef unsigned long kdbva_t;
+typedef unsigned char kdbbyt_t;
+extern int dbg_rw_mem(kdbva_t, kdbbyt_t *, int, domid_t, int, uint64_t);
+
+static int
+gdbsx_guest_mem_io(domid_t domid, struct xen_domctl_gdbsx_memio *iop)
+{
+    ulong l_uva = (ulong)iop->uva;
+    iop->remain = dbg_rw_mem(
+        (kdbva_t)iop->gva, (kdbbyt_t *)l_uva, iop->len, domid,
+        iop->gwr, iop->pgd3val);
+    return (iop->remain ? -EFAULT : 0);
+}
 
 long arch_do_domctl(
     struct xen_domctl *domctl,
@@ -230,10 +245,14 @@ long arch_do_domctl(
         ret = -EINVAL;
         if ( d != NULL )
         {
-            ret = 0;
-
             spin_lock(&d->page_alloc_lock);
 
+            if ( unlikely(d->is_dying) ) {
+                spin_unlock(&d->page_alloc_lock);
+                goto getmemlist_out;
+            }
+
+            ret = 0;
             list_ent = d->page_list.next;
             for ( i = 0; (i < max_pfns) && (list_ent != &d->page_list); i++ )
             {
@@ -253,6 +272,7 @@ long arch_do_domctl(
             domctl->u.getmemlist.num_pfns = i;
             copy_to_guest(u_domctl, domctl, 1);
 
+        getmemlist_out:
             rcu_unlock_domain(d);
         }
     }
@@ -382,6 +402,29 @@ long arch_do_domctl(
     }
     break;
 
+    case XEN_DOMCTL_gethvmcontext_partial:
+    { 
+        struct domain *d;
+
+        ret = -ESRCH;
+        if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL )
+            break;
+
+        ret = -EINVAL;
+        if ( !is_hvm_domain(d) ) 
+            goto gethvmcontext_partial_out;
+
+        domain_pause(d);
+        ret = hvm_save_one(d, domctl->u.hvmcontext_partial.type,
+                           domctl->u.hvmcontext_partial.instance,
+                           domctl->u.hvmcontext_partial.buffer);
+        domain_unpause(d);
+
+    gethvmcontext_partial_out:
+        rcu_unlock_domain(d);
+    }
+    break;
+
     case XEN_DOMCTL_set_address_size:
     {
         struct domain *d;
@@ -417,7 +460,8 @@ long arch_do_domctl(
         if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL )
             break;
 
-        domctl->u.address_size.size = BITS_PER_GUEST_LONG(d);
+        domctl->u.address_size.size =
+            is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
 
         ret = 0;
         rcu_unlock_domain(d);
@@ -427,6 +471,432 @@ long arch_do_domctl(
     }
     break;
 
+    case XEN_DOMCTL_get_device_group:
+    {
+        struct domain *d;
+        u32 max_sdevs;
+        u8 bus, devfn;
+        XEN_GUEST_HANDLE_64(uint32_t) sdevs;
+        int num_sdevs;
+
+        ret = -ENOSYS;
+        if ( !iommu_enabled )
+            break;
+
+        ret = -EINVAL;
+        if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL )
+            break;
+
+        bus = (domctl->u.get_device_group.machine_bdf >> 16) & 0xff;
+        devfn = (domctl->u.get_device_group.machine_bdf >> 8) & 0xff;
+        max_sdevs = domctl->u.get_device_group.max_sdevs;
+        sdevs = domctl->u.get_device_group.sdev_array;
+
+        num_sdevs = iommu_get_device_group(d, bus, devfn, sdevs, max_sdevs);
+        if ( num_sdevs < 0 )
+        {
+            dprintk(XENLOG_ERR, "iommu_get_device_group() failed!\n");
+            ret = -EFAULT;
+            domctl->u.get_device_group.num_sdevs = 0;
+        }
+        else
+        {
+            ret = 0;
+            domctl->u.get_device_group.num_sdevs = num_sdevs;
+        }
+        if ( copy_to_guest(u_domctl, domctl, 1) )
+            ret = -EFAULT;
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_test_assign_device:
+    {
+        u8 bus, devfn;
+
+        ret = -ENOSYS;
+        if ( !iommu_enabled )
+            break;
+
+        ret = -EINVAL;
+        bus = (domctl->u.assign_device.machine_bdf >> 16) & 0xff;
+        devfn = (domctl->u.assign_device.machine_bdf >> 8) & 0xff;
+
+        if ( device_assigned(bus, devfn) )
+        {
+            gdprintk(XENLOG_ERR, "XEN_DOMCTL_test_assign_device: "
+                     "%x:%x:%x already assigned, or non-existent\n",
+                     bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+            break;
+        }
+        ret = 0;
+    }
+    break;
+
+    case XEN_DOMCTL_assign_device:
+    {
+        struct domain *d;
+        u8 bus, devfn;
+
+        ret = -ENOSYS;
+        if ( !iommu_enabled )
+            break;
+
+        ret = -EINVAL;
+        if ( unlikely((d = get_domain_by_id(domctl->domain)) == NULL) )
+        {
+            gdprintk(XENLOG_ERR,
+                "XEN_DOMCTL_assign_device: get_domain_by_id() failed\n");
+            break;
+        }
+
+        bus = (domctl->u.assign_device.machine_bdf >> 16) & 0xff;
+        devfn = (domctl->u.assign_device.machine_bdf >> 8) & 0xff;
+
+        ret = assign_device(d, bus, devfn);
+        if ( ret )
+            gdprintk(XENLOG_ERR, "XEN_DOMCTL_assign_device: "
+                     "assign device (%x:%x:%x) failed\n",
+                     bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+        put_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_deassign_device:
+    {
+        struct domain *d;
+        u8 bus, devfn;
+
+        ret = -ENOSYS;
+        if ( !iommu_enabled )
+            break;
+
+        ret = -EINVAL;
+        if ( unlikely((d = get_domain_by_id(domctl->domain)) == NULL) )
+        {
+            gdprintk(XENLOG_ERR,
+                "XEN_DOMCTL_deassign_device: get_domain_by_id() failed\n");
+            break;
+        }
+
+        bus = (domctl->u.assign_device.machine_bdf >> 16) & 0xff;
+        devfn = (domctl->u.assign_device.machine_bdf >> 8) & 0xff;
+
+        spin_lock(&pcidevs_lock);
+        ret = deassign_device(d, bus, devfn);
+        spin_unlock(&pcidevs_lock);
+        gdprintk(XENLOG_INFO, "XEN_DOMCTL_deassign_device: bdf = %x:%x:%x\n",
+            bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+        put_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_bind_pt_irq:
+    {
+        struct domain * d;
+        xen_domctl_bind_pt_irq_t * bind;
+
+        ret = -ESRCH;
+        if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL )
+            break;
+        bind = &(domctl->u.bind_pt_irq);
+
+        ret = -ESRCH;
+        if ( iommu_enabled )
+        {
+            spin_lock(&pcidevs_lock);
+            ret = pt_irq_create_bind_vtd(d, bind);
+            spin_unlock(&pcidevs_lock);
+        }
+        if ( ret < 0 )
+            gdprintk(XENLOG_ERR, "pt_irq_create_bind failed!\n");
+
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_unbind_pt_irq:
+    {
+        struct domain * d;
+        xen_domctl_bind_pt_irq_t * bind;
+
+        ret = -ESRCH;
+        if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL )
+            break;
+        bind = &(domctl->u.bind_pt_irq);
+        if ( iommu_enabled )
+        {
+            spin_lock(&pcidevs_lock);
+            ret = pt_irq_destroy_bind_vtd(d, bind);
+            spin_unlock(&pcidevs_lock);
+        }
+        if ( ret < 0 )
+            gdprintk(XENLOG_ERR, "pt_irq_destroy_bind failed!\n");
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_memory_mapping:
+    {
+        struct domain *d;
+        unsigned long gfn = domctl->u.memory_mapping.first_gfn;
+        unsigned long mfn = domctl->u.memory_mapping.first_mfn;
+        unsigned long nr_mfns = domctl->u.memory_mapping.nr_mfns;
+        int i;
+
+        ret = -EINVAL;
+        if ( (mfn + nr_mfns - 1) < mfn ) /* wrap? */
+            break;
+
+        ret = -ESRCH;
+        if ( unlikely((d = rcu_lock_domain_by_id(domctl->domain)) == NULL) )
+            break;
+
+        ret = 0;
+        if ( domctl->u.memory_mapping.add_mapping )
+        {
+            gdprintk(XENLOG_INFO,
+                "memory_map:add: gfn=%lx mfn=%lx nr_mfns=%lx\n",
+                gfn, mfn, nr_mfns);
+
+            ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1);
+            for ( i = 0; i < nr_mfns; i++ )
+                set_mmio_p2m_entry(d, gfn+i, _mfn(mfn+i));
+        }
+        else
+        {
+            gdprintk(XENLOG_INFO,
+                "memory_map:remove: gfn=%lx mfn=%lx nr_mfns=%lx\n",
+                 gfn, mfn, nr_mfns);
+
+            for ( i = 0; i < nr_mfns; i++ )
+                clear_mmio_p2m_entry(d, gfn+i);
+            ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1);
+        }
+
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_ioport_mapping:
+    {
+#define MAX_IOPORTS    0x10000
+        struct domain *d;
+        struct hvm_iommu *hd;
+        unsigned int fgp = domctl->u.ioport_mapping.first_gport;
+        unsigned int fmp = domctl->u.ioport_mapping.first_mport;
+        unsigned int np = domctl->u.ioport_mapping.nr_ports;
+        struct g2m_ioport *g2m_ioport;
+        int found = 0;
+
+        ret = -EINVAL;
+        if ( (np == 0) || (fgp > MAX_IOPORTS) || (fmp > MAX_IOPORTS) ||
+            ((fgp + np) > MAX_IOPORTS) || ((fmp + np) > MAX_IOPORTS) )
+        {
+            gdprintk(XENLOG_ERR,
+                "ioport_map:invalid:gport=%x mport=%x nr_ports=%x\n",
+                fgp, fmp, np);
+            break;
+        }
+
+        ret = -ESRCH;
+        if ( unlikely((d = rcu_lock_domain_by_id(domctl->domain)) == NULL) )
+            break;
+
+        hd = domain_hvm_iommu(d);
+        if ( domctl->u.ioport_mapping.add_mapping )
+        {
+            gdprintk(XENLOG_INFO,
+                "ioport_map:add f_gport=%x f_mport=%x np=%x\n",
+                fgp, fmp, np);
+
+            list_for_each_entry(g2m_ioport, &hd->g2m_ioport_list, list)
+                if (g2m_ioport->mport == fmp )
+                {
+                    g2m_ioport->gport = fgp;
+                    g2m_ioport->np = np;
+                    found = 1;
+                    break;
+                }
+            if ( !found )
+            {
+                g2m_ioport = xmalloc(struct g2m_ioport);
+                g2m_ioport->gport = fgp;
+                g2m_ioport->mport = fmp;
+                g2m_ioport->np = np;
+                list_add_tail(&g2m_ioport->list, &hd->g2m_ioport_list);
+            }
+            ret = ioports_permit_access(d, fmp, fmp + np - 1);
+        }
+        else
+        {
+            gdprintk(XENLOG_INFO,
+                "ioport_map:remove f_gport=%x f_mport=%x np=%x\n",
+                fgp, fmp, np);
+            list_for_each_entry(g2m_ioport, &hd->g2m_ioport_list, list)
+                if ( g2m_ioport->mport == fmp )
+                {
+                    list_del(&g2m_ioport->list);
+                    xfree(g2m_ioport);
+                    break;
+                }
+            ret = ioports_deny_access(d, fmp, fmp + np - 1);
+        }
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_sendtrigger:
+    {
+        struct domain *d;
+        struct vcpu *v;
+
+        ret = -ESRCH;
+        if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL )
+            break;
+
+        ret = -EINVAL;
+        if ( domctl->u.sendtrigger.vcpu >= MAX_VIRT_CPUS )
+            goto sendtrigger_out;
+
+        ret = -ESRCH;
+        if ( (v = d->vcpu[domctl->u.sendtrigger.vcpu]) == NULL )
+            goto sendtrigger_out;
+
+        switch ( domctl->u.sendtrigger.trigger )
+        {
+        case XEN_DOMCTL_SENDTRIGGER_NMI:
+        {
+            ret = -ENOSYS;
+            if ( !is_hvm_domain(d) )
+                break;
+
+            ret = 0;
+            if ( !test_and_set_bool(v->arch.hvm_vcpu.nmi_pending) )
+                vcpu_kick(v);
+        }
+        break;
+
+        default:
+            ret = -ENOSYS;
+        }
+
+    sendtrigger_out:
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_gdbsx_guestmemio:
+    {
+        struct domain *d;
+
+        ret = -ESRCH;
+        if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL )
+            break;
+
+        domctl->u.gdbsx_guest_memio.remain =
+            domctl->u.gdbsx_guest_memio.len;
+
+        ret = gdbsx_guest_mem_io(domctl->domain, &domctl->u.gdbsx_guest_memio);
+        if ( !ret && copy_to_guest(u_domctl, domctl, 1) )
+            ret = -EFAULT;
+
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_gdbsx_pausevcpu:
+    {
+        struct domain *d;
+        struct vcpu *v;
+
+        ret = -ESRCH;
+        if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL )
+            break;
+
+        ret = -EBUSY;
+        if ( !d->is_paused_by_controller )
+        {
+            rcu_unlock_domain(d);
+            break;
+        }
+        ret = -EINVAL;
+        if ( domctl->u.gdbsx_pauseunp_vcpu.vcpu >= MAX_VIRT_CPUS ||
+             (v = d->vcpu[domctl->u.gdbsx_pauseunp_vcpu.vcpu]) == NULL )
+        {
+            rcu_unlock_domain(d);
+            break;
+        }
+        vcpu_pause(v);
+        ret = 0;
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_gdbsx_unpausevcpu:
+    {
+        struct domain *d;
+        struct vcpu *v;
+
+        ret = -ESRCH;
+        if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL )
+            break;
+
+        ret = -EBUSY;
+        if ( !d->is_paused_by_controller )
+        {
+            rcu_unlock_domain(d);
+            break;
+        }
+        ret = -EINVAL;
+        if ( domctl->u.gdbsx_pauseunp_vcpu.vcpu >= MAX_VIRT_CPUS ||
+             (v = d->vcpu[domctl->u.gdbsx_pauseunp_vcpu.vcpu]) == NULL )
+        {
+            rcu_unlock_domain(d);
+            break;
+        }
+        if ( !atomic_read(&v->pause_count) )
+            printk("WARN: Unpausing vcpu:%d which is not paused\n", v->vcpu_id);
+        vcpu_unpause(v);
+        ret = 0;
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_gdbsx_domstatus:
+    {
+        struct domain *d;
+        struct vcpu *v;
+
+        ret = -ESRCH;
+        if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL )
+            break;
+
+        domctl->u.gdbsx_domstatus.vcpu_id = -1;
+        domctl->u.gdbsx_domstatus.paused = d->is_paused_by_controller;
+        if ( domctl->u.gdbsx_domstatus.paused )
+        {
+            for_each_vcpu ( d, v )
+            {
+                if ( v->arch.gdbsx_vcpu_event )
+                {
+                    domctl->u.gdbsx_domstatus.vcpu_id = v->vcpu_id;
+                    domctl->u.gdbsx_domstatus.vcpu_ev =
+                        v->arch.gdbsx_vcpu_event;
+                    v->arch.gdbsx_vcpu_event = 0;
+                    break;
+                }
+            }
+        }
+        ret = 0;
+        if ( copy_to_guest(u_domctl, domctl, 1) )
+            ret = -EFAULT;
+        rcu_unlock_domain(d);
+    }
+    break;
+
     default:
         ret = -ENOSYS;
         break;
@@ -485,9 +955,9 @@ void arch_get_info_guest(struct vcpu *v,
             c.nat->ctrlreg[3] = xen_pfn_to_cr3(
                 pagetable_get_pfn(v->arch.guest_table));
 #ifdef __x86_64__
-            if ( !pagetable_is_null(v->arch.guest_table_user) )
-                c.nat->ctrlreg[1] = xen_pfn_to_cr3(
-                    pagetable_get_pfn(v->arch.guest_table_user));
+            c.nat->ctrlreg[1] =
+                pagetable_is_null(v->arch.guest_table_user) ? 0
+                : xen_pfn_to_cr3(pagetable_get_pfn(v->arch.guest_table_user));
 #endif
         }
 #ifdef CONFIG_COMPAT
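
The device-assignment domctls above all decode the same machine_bdf packing:
bus in bits 16-23 and devfn (slot plus function) in bits 8-15. A small
standalone sketch of that decoding; the sample device number is made up.

#include <stdio.h>
#include <stdint.h>

/* devfn packs a 5-bit slot and a 3-bit function, as in Linux/Xen. */
#define PCI_SLOT(devfn) (((devfn) >> 3) & 0x1f)
#define PCI_FUNC(devfn) ((devfn) & 0x07)

int main(void)
{
    /* Made-up machine_bdf for device 00:1d.7, packed the way the
     * assign/deassign/test_assign domctls above expect it. */
    uint32_t machine_bdf = (0x00 << 16) | ((0x1d << 3 | 0x7) << 8);
    uint8_t bus   = (machine_bdf >> 16) & 0xff;
    uint8_t devfn = (machine_bdf >> 8) & 0xff;

    printf("%02x:%02x.%x\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
    return 0;
}
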
diff -Naurp xen/arch/x86/e820.c xen-redhat/arch/x86/e820.c
--- xen/arch/x86/e820.c
+++ xen-redhat/arch/x86/e820.c
@@ -2,6 +2,7 @@
 #include <xen/init.h>
 #include <xen/lib.h>
 #include <xen/compat.h>
+#include <xen/dmi.h>
 #include <asm/e820.h>
 #include <asm/page.h>
 
@@ -367,6 +368,15 @@ static void __init clip_mem(void)
     }
 }
 
+static void __init reserve_dmi_region(void)
+{
+    u32 base, len;
+    if ( (dmi_get_table(&base, &len) == 0) && ((base + len) > base) &&
+         reserve_e820_ram(&e820, base, base + len) )
+        printk("WARNING: DMI table located in E820 RAM %08x-%08x. Fixed.\n",
+               base, base+len);
+}
+
 static void __init machine_specific_memory_setup(
     struct e820entry *raw, int *raw_nr)
 {
@@ -376,6 +386,73 @@ static void __init machine_specific_memo
     (void)copy_e820_map(raw, nr);
     clip_4gb();
     clip_mem();
+    reserve_dmi_region();
+}
+
+/* Reserve RAM area (@s,@e) in the specified e820 map. */
+int __init reserve_e820_ram(struct e820map *e820, uint64_t s, uint64_t e)
+{
+    uint64_t rs = 0, re = 0;
+    int i;
+
+    for ( i = 0; i < e820->nr_map; i++ )
+    {
+        /* Have we found the e820 region that includes the specified range? */
+        rs = e820->map[i].addr;
+        re = rs + e820->map[i].size;
+        if ( (s >= rs) && (e <= re) )
+            break;
+    }
+
+    if ( (i == e820->nr_map) || (e820->map[i].type != E820_RAM) )
+        return 0;
+
+    if ( (s == rs) && (e == re) )
+    {
+        /* Complete excision. */
+        memmove(&e820->map[i], &e820->map[i+1],
+                (e820->nr_map-i-1) * sizeof(e820->map[0]));
+        e820->nr_map--;
+    }
+    else if ( s == rs )
+    {
+        /* Truncate start. */
+        e820->map[i].addr += e - s;
+        e820->map[i].size -= e - s;
+    }
+    else if ( e == re )
+    {
+        /* Truncate end. */
+        e820->map[i].size -= e - s;
+    }
+    else if ( e820->nr_map < ARRAY_SIZE(e820->map) )
+    {
+        /* Split in two. */
+        memmove(&e820->map[i+1], &e820->map[i],
+                (e820->nr_map-i) * sizeof(e820->map[0]));
+        e820->nr_map++;
+        e820->map[i].size = s - rs;
+        i++;
+        e820->map[i].addr = e;
+        e820->map[i].size = re - e;
+    }
+    else
+    {
+        /* e820map is at maximum size. We have to leak some space. */
+        if ( (s - rs) > (re - e) )
+        {
+            printk("e820 overflow: leaking RAM %"PRIx64"-%"PRIx64"\n", e, re);
+            e820->map[i].size = s - rs;
+        }
+        else
+        {
+            printk("e820 overflow: leaking RAM %"PRIx64"-%"PRIx64"\n", rs, s);
+            e820->map[i].addr = e;
+            e820->map[i].size = re - e;
+        }
+    }
+
+    return 1;
 }
 
 unsigned long __init init_e820(
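
reserve_e820_ram() above distinguishes four cases when punching the DMI
region out of a RAM range: complete excision, truncating the start,
truncating the end, and splitting the range in two. Below is a simplified
standalone version of that case analysis; it omits the overflow/leak fallback
and uses made-up addresses.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct range { uint64_t start, end; };        /* [start, end) RAM ranges */

/* Remove [s, e) from map[i], possibly splitting it.  Returns the new
 * number of entries.  The caller guarantees map[] has room for a split. */
static int punch_hole(struct range *map, int n, int i, uint64_t s, uint64_t e)
{
    uint64_t rs = map[i].start, re = map[i].end;

    if (s == rs && e == re) {                 /* complete excision */
        memmove(&map[i], &map[i + 1], (n - i - 1) * sizeof(map[0]));
        return n - 1;
    }
    if (s == rs) {                            /* truncate start */
        map[i].start = e;
        return n;
    }
    if (e == re) {                            /* truncate end */
        map[i].end = s;
        return n;
    }
    memmove(&map[i + 1], &map[i], (n - i) * sizeof(map[0]));   /* split */
    map[i].end = s;
    map[i + 1].start = e;
    return n + 1;
}

int main(void)
{
    struct range map[4] = { { 0x100000, 0x7f000000 } };
    int n = punch_hole(map, 1, 0, 0x7e000000, 0x7e00d000);
    int i;

    for (i = 0; i < n; i++)
        printf("RAM %#llx-%#llx\n",
               (unsigned long long)map[i].start,
               (unsigned long long)map[i].end);
    return 0;
}
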
diff -Naurp xen/arch/x86/flushtlb.c xen-redhat/arch/x86/flushtlb.c
--- xen/arch/x86/flushtlb.c
+++ xen-redhat/arch/x86/flushtlb.c
@@ -83,9 +83,12 @@ void write_cr3(unsigned long cr3)
     hvm_flush_guest_tlbs();
 
 #ifdef USER_MAPPINGS_ARE_GLOBAL
-    __pge_off();
-    __asm__ __volatile__ ( "mov %0, %%cr3" : : "r" (cr3) : "memory" );
-    __pge_on();
+    {
+        unsigned long cr4 = read_cr4();
+        write_cr4(cr4 & ~X86_CR4_PGE);
+        asm volatile ( "mov %0, %%cr3" : : "r" (cr3) : "memory" );
+        write_cr4(cr4);
+    }
 #else
     __asm__ __volatile__ ( "mov %0, %%cr3" : : "r" (cr3) : "memory" );
 #endif
@@ -108,8 +111,12 @@ void local_flush_tlb(void)
     hvm_flush_guest_tlbs();
 
 #ifdef USER_MAPPINGS_ARE_GLOBAL
-    __pge_off();
-    __pge_on();
+    {
+            unsigned long cr4 = read_cr4();
+            write_cr4(cr4 & ~X86_CR4_PGE);
+            barrier();
+            write_cr4(cr4);
+    }
 #else
     __asm__ __volatile__ ( "mov %0, %%cr3" : : "r" (read_cr3()) : "memory" );
 #endif
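
The rewritten flush paths above drop the __pge_off()/__pge_on() helpers in
favour of explicit write_cr4() calls; clearing and restoring CR4.PGE forces
the CPU to discard all TLB entries, including global ones. The per-CPU cr4
variable added in domain.c suggests that read_cr4()/write_cr4() keep a
software shadow of the register; the toy model below works from that
assumption (the real write also issues a mov to %cr4).

#include <stdio.h>

#define X86_CR4_PGE 0x80UL            /* CR4 bit 7: global-page enable */

static unsigned long cr4_shadow;      /* stands in for per_cpu(cr4)    */

static unsigned long read_cr4(void)   { return cr4_shadow; }
static void write_cr4(unsigned long val)
{
    cr4_shadow = val;                 /* real code also updates %cr4 itself */
}

int main(void)
{
    unsigned long cr4;

    write_cr4(0x6f0UL | X86_CR4_PGE);

    /* Toggling PGE off and back on is what flushes the TLB, global
     * entries included, in the patched local_flush_tlb()/write_cr3(). */
    cr4 = read_cr4();
    write_cr4(cr4 & ~X86_CR4_PGE);
    write_cr4(cr4);

    printf("cr4 = %#lx\n", read_cr4());
    return 0;
}
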
diff -Naurp xen/arch/x86/genapic/es7000plat.c xen-redhat/arch/x86/genapic/es7000plat.c
--- xen/arch/x86/genapic/es7000plat.c
+++ xen-redhat/arch/x86/genapic/es7000plat.c
@@ -36,6 +36,7 @@
 #include <asm/io.h>
 #include <asm/smp.h>
 #include <asm/apicdef.h>
+#include <asm/processor.h>
 #include "es7000.h"
 
 /*
diff -Naurp xen/arch/x86/hvm/hpet.c xen-redhat/arch/x86/hvm/hpet.c
--- xen/arch/x86/hvm/hpet.c
+++ xen-redhat/arch/x86/hvm/hpet.c
@@ -24,6 +24,12 @@
 #include <xen/sched.h>
 #include <xen/event.h>
 
+#define domain_vhpet(x) (&(x)->arch.hvm_domain.pl_time.vhpet)
+#define vcpu_vhpet(x)   (domain_vhpet((x)->domain))
+#define vhpet_domain(x) (container_of((x), struct domain, \
+                                      arch.hvm_domain.pl_time.vhpet))
+#define vhpet_vcpu(x)   (pt_global_vcpu_target(vhpet_domain(x)))
+
 #define HPET_BASE_ADDRESS   0xfed00000ULL
 #define HPET_MMAP_SIZE      1024
 #define S_TO_NS  1000000000ULL           /* 1s  = 10^9  ns */
@@ -31,7 +37,7 @@
 
 /* Frequency_of_TSC / frequency_of_HPET = 32 */
 #define TSC_PER_HPET_TICK 32
-#define guest_time_hpet(v) (hvm_get_guest_time(v) / TSC_PER_HPET_TICK)
+#define guest_time_hpet(hpet) (hvm_get_guest_tsc(vhpet_vcpu(hpet)) / TSC_PER_HPET_TICK)
 
 #define HPET_ID         0x000
 #define HPET_PERIOD     0x004
@@ -71,8 +77,9 @@
 #define HPET_TN_INT_ROUTE_CAP_MASK (0xffffffffULL \
                     << HPET_TN_INT_ROUTE_CAP_SHIFT)
 
-#define hpet_tick_to_ns(h, tick) ((s_time_t)(tick)* \
-                                  (S_TO_NS*TSC_PER_HPET_TICK)/h->tsc_freq)
+#define hpet_tick_to_ns(h, tick)                        \
+    ((s_time_t)((((tick) > (h)->hpet_to_ns_limit) ?     \
+        ~0ULL : (tick) * (h)->hpet_to_ns_scale) >> 10))
 
 #define timer_config(h, n)       (h->hpet.timers[n].config)
 #define timer_enabled(h, n)      (timer_config(h, n) & HPET_TN_ENABLE)
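
The new hpet_tick_to_ns() replaces a 64-bit divide per conversion with a
multiply by a precomputed 10-bit fixed-point scale and a shift, with
hpet_to_ns_limit guarding against overflowing the multiply. How the scale and
limit are derived is not visible in this hunk, so the setup below is an
assumption, sketched as standalone C.

#include <stdio.h>
#include <stdint.h>

#define S_TO_NS            1000000000ULL
#define TSC_PER_HPET_TICK  32ULL

int main(void)
{
    uint64_t tsc_freq = 2400000000ULL;                   /* e.g. 2.4 GHz   */
    /* Assumed setup: ns per HPET tick, scaled by 2^10.  */
    uint64_t scale = (S_TO_NS * TSC_PER_HPET_TICK << 10) / tsc_freq;
    uint64_t limit = ~0ULL / scale;                      /* overflow guard */
    uint64_t tick  = 75000000ULL;                        /* ~1s at 75 MHz  */

    uint64_t ns = (tick > limit) ? ~0ULL : (tick * scale) >> 10;
    printf("scale=%llu limit=%llu ns=%llu\n",
           (unsigned long long)scale, (unsigned long long)limit,
           (unsigned long long)ns);
    return 0;
}
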
@@ -116,22 +123,26 @@ static inline uint64_t hpet_read_maincou
     ASSERT(spin_is_locked(&h->lock));
 
     if ( hpet_enabled(h) )
-        return guest_time_hpet(h->vcpu) + h->mc_offset;
+        return guest_time_hpet(h) + h->mc_offset;
     else 
         return h->hpet.mc64;
 }
 
-static unsigned long hpet_read(
-    struct vcpu *v, unsigned long addr, unsigned long length)
+static int hpet_read(
+    struct vcpu *v, unsigned long addr, unsigned long length,
+    unsigned long *pval)
 {
-    HPETState *h = &v->domain->arch.hvm_domain.pl_time.vhpet;
+    HPETState *h = vcpu_vhpet(v);
     unsigned long result;
     uint64_t val;
 
     addr &= HPET_MMAP_SIZE-1;
 
     if ( hpet_check_access_length(addr, length) != 0 )
-        return ~0UL;
+    {
+        result = ~0ul;
+        goto out;
+    }
 
     spin_lock(&h->lock);
 
@@ -145,7 +156,9 @@ static unsigned long hpet_read(
 
     spin_unlock(&h->lock);
 
-    return result;
+ out:
+    *pval = result;
+    return 1;
 }
 
 static void hpet_stop_timer(HPETState *h, unsigned int tn)
@@ -173,7 +186,7 @@ static void hpet_set_timer(HPETState *h,
     {
         /* HPET specification requires PIT shouldn't generate
          * interrupts if LegacyReplacementRoute is set for timer0 */
-        PITState *pit = &h->vcpu->domain->arch.hvm_domain.pl_time.vpit;
+        PITState *pit = &vhpet_domain(h)->arch.hvm_domain.pl_time.vpit;
         pit_stop_channel0_irq(pit);
     }
 
@@ -208,18 +221,18 @@ static inline uint64_t hpet_fixup_reg(
     return new;
 }
 
-static void hpet_write(
+static int hpet_write(
     struct vcpu *v, unsigned long addr,
     unsigned long length, unsigned long val)
 {
-    HPETState *h = &v->domain->arch.hvm_domain.pl_time.vhpet;
+    HPETState *h = vcpu_vhpet(v);
     uint64_t old_val, new_val;
     int tn, i;
 
     addr &= HPET_MMAP_SIZE-1;
 
     if ( hpet_check_access_length(addr, length) != 0 )
-        return;
+        goto out;
 
     spin_lock(&h->lock);
 
@@ -241,14 +254,14 @@ static void hpet_write(
         if ( !(old_val & HPET_CFG_ENABLE) && (new_val & HPET_CFG_ENABLE) )
         {
             /* Enable main counter and interrupt generation. */
-            h->mc_offset = h->hpet.mc64 - guest_time_hpet(h->vcpu);
+            h->mc_offset = h->hpet.mc64 - guest_time_hpet(h);
             for ( i = 0; i < HPET_TIMER_NUM; i++ )
                 hpet_set_timer(h, i); 
         }
         else if ( (old_val & HPET_CFG_ENABLE) && !(new_val & HPET_CFG_ENABLE) )
         {
             /* Halt main counter and disable interrupt generation. */
-            h->hpet.mc64 = h->mc_offset + guest_time_hpet(h->vcpu);
+            h->hpet.mc64 = h->mc_offset + guest_time_hpet(h);
             for ( i = 0; i < HPET_TIMER_NUM; i++ )
                 hpet_stop_timer(h, i);
         }
@@ -314,11 +327,15 @@ static void hpet_write(
     }
 
     spin_unlock(&h->lock);
+
+ out:
+    return 1;
 }
 
 static int hpet_range(struct vcpu *v, unsigned long addr)
 {
-    return ((addr >= HPET_BASE_ADDRESS) &&
+    return (v->domain->arch.hvm_domain.params[HVM_PARAM_HPET_ENABLED] &&
+            (addr >= HPET_BASE_ADDRESS) &&
             (addr < (HPET_BASE_ADDRESS + HPET_MMAP_SIZE)));
 }
 
@@ -331,7 +348,7 @@ struct hvm_mmio_handler hpet_mmio_handle
 static void hpet_route_interrupt(HPETState *h, unsigned int tn)
 {
     unsigned int tn_int_route = timer_int_route(h, tn);
-    struct domain *d = h->vcpu->domain;
+    struct domain *d = vhpet_domain(h);
 
     ASSERT(spin_is_locked(&h->lock));
 
@@ -399,25 +416,25 @@ static void hpet_timer_fn(void *opaque)
 
 void hpet_migrate_timers(struct vcpu *v)
 {
-    struct HPETState *h = &v->domain->arch.hvm_domain.pl_time.vhpet;
+    struct HPETState *h = vcpu_vhpet(v);
     int i;
 
-    if ( v != h->vcpu )
+    if ( v != vhpet_vcpu(h) )
         return;
-
+
     for ( i = 0; i < HPET_TIMER_NUM; i++ )
         migrate_timer(&h->timers[i], v->processor);
 }
 
 static int hpet_save(struct domain *d, hvm_domain_context_t *h)
 {
-    HPETState *hp = &d->arch.hvm_domain.pl_time.vhpet;
+    HPETState *hp = domain_vhpet(d);
     int rc;
 
     spin_lock(&hp->lock);
 
     /* Write the proper value into the main counter */
-    hp->hpet.mc64 = hp->mc_offset + guest_time_hpet(hp->vcpu);
+    hp->hpet.mc64 = hp->mc_offset + guest_time_hpet(hp);
 
     /* Save the HPET registers */
     rc = hvm_save_entry(HPET, 0, h, &hp->hpet);
@@ -429,7 +446,7 @@ static int hpet_save(struct domain *d, h
 
 static int hpet_load(struct domain *d, hvm_domain_context_t *h)
 {
-    HPETState *hp = &d->arch.hvm_domain.pl_time.vhpet;
+    HPETState *hp = domain_vhpet(d);
     int i;
 
     spin_lock(&hp->lock);
@@ -442,7 +459,7 @@ static int hpet_load(struct domain *d, h
     }
     
     /* Recalculate the offset between the main counter and guest time */
-    hp->mc_offset = hp->hpet.mc64 - guest_time_hpet(hp->vcpu);
+    hp->mc_offset = hp->hpet.mc64 - guest_time_hpet(hp);
                 
     /* Restart the timers */
     for ( i = 0; i < HPET_TIMER_NUM; i++ )
@@ -457,16 +474,18 @@ HVM_REGISTER_SAVE_RESTORE(HPET, hpet_sav
 
 void hpet_init(struct vcpu *v)
 {
-    HPETState *h = &v->domain->arch.hvm_domain.pl_time.vhpet;
+    HPETState *h = vcpu_vhpet(v);
     int i;
 
     memset(h, 0, sizeof(HPETState));
 
     spin_lock_init(&h->lock);
 
-    h->vcpu = v;
     h->tsc_freq = ticks_per_sec(v);
 
+    h->hpet_to_ns_scale = ((S_TO_NS * TSC_PER_HPET_TICK) << 10) / h->tsc_freq;
+    h->hpet_to_ns_limit = (~0ULL >> 1) / h->hpet_to_ns_scale;
+
     /* 64-bit main counter; 3 timers supported; LegacyReplacementRoute. */
     h->hpet.capability = 0x8086A201ULL;
 
@@ -489,7 +508,7 @@ void hpet_init(struct vcpu *v)
 void hpet_deinit(struct domain *d)
 {
     int i;
-    HPETState *h = &d->arch.hvm_domain.pl_time.vhpet;
+    HPETState *h = domain_vhpet(d);
 
     for ( i = 0; i < HPET_TIMER_NUM; i++ )
         kill_timer(&h->timers[i]);
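
The tsc_freq divide in hpet_tick_to_ns() is replaced above by a 10-bit fixed-point multiply precomputed in hpet_init(). A small standalone sketch of that arithmetic (the TSC frequency is an arbitrary example, not a value taken from the patch):

#include <stdint.h>
#include <stdio.h>

#define S_TO_NS           1000000000ULL   /* 1 s = 10^9 ns */
#define TSC_PER_HPET_TICK 32

int main(void)
{
    uint64_t tsc_freq = 2400000000ULL;     /* example only: 2.4 GHz guest TSC */

    /* Precomputed once, as in hpet_init(). */
    uint64_t scale = ((S_TO_NS * TSC_PER_HPET_TICK) << 10) / tsc_freq;
    uint64_t limit = (~0ULL >> 1) / scale; /* saturate instead of overflowing */

    /* 75000000 HPET ticks at tsc_freq/32 = 75 MHz should be about 1 second. */
    uint64_t tick = 75000000ULL;
    uint64_t ns   = ((tick > limit) ? ~0ULL : tick * scale) >> 10;

    printf("scale=%llu limit=%llu ns=%llu\n",
           (unsigned long long)scale, (unsigned long long)limit,
           (unsigned long long)ns);
    return 0;
}
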
diff -Naurp xen/arch/x86/hvm/hvm.c xen-redhat/arch/x86/hvm/hvm.c
--- xen/arch/x86/hvm/hvm.c
+++ xen-redhat/arch/x86/hvm/hvm.c
@@ -49,6 +49,16 @@
 #include <public/version.h>
 #include <public/memory.h>
 
+/*
+ * Xen command-line option to allow/disallow hardware-assisted paging.
+ * Since the phys-to-machine table of AMD NPT is in host format, 32-bit Xen
+ * can only support guests using NPT with up to a 4GB memory map. Therefore
+ * we disallow HAP by default on PAE Xen (by default we want to support an
+ * 8GB pseudophysical memory map for HVM guests on a PAE host).
+ */
+static int opt_hap_permitted = (CONFIG_PAGING_LEVELS != 3);
+boolean_param("hap", opt_hap_permitted);
+
 int hvm_enabled __read_mostly;
 
 unsigned int opt_hvm_debug_level __read_mostly;
@@ -74,6 +84,14 @@ void hvm_enable(struct hvm_function_tabl
 
     hvm_funcs   = *fns;
     hvm_enabled = 1;
+
+    if ( hvm_funcs.hap_supported )
+    {
+        if ( !opt_hap_permitted )
+            hvm_funcs.hap_supported = 0;
+        printk("HVM: Hardware Assisted Paging detected %s.\n",
+               hvm_funcs.hap_supported ? "and enabled" : "but disabled");
+    }
 }
 
 void hvm_disable(void)
@@ -89,17 +107,17 @@ void hvm_stts(struct vcpu *v)
         hvm_funcs.stts(v);
 }
 
-void hvm_set_guest_time(struct vcpu *v, u64 gtime)
+void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc)
 {
     u64 host_tsc;
 
     rdtscll(host_tsc);
 
-    v->arch.hvm_vcpu.cache_tsc_offset = gtime - host_tsc;
+    v->arch.hvm_vcpu.cache_tsc_offset = guest_tsc - host_tsc;
     hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
 }
 
-u64 hvm_get_guest_time(struct vcpu *v)
+u64 hvm_get_guest_tsc(struct vcpu *v)
 {
     u64 host_tsc;
 
@@ -120,7 +138,7 @@ void hvm_do_resume(struct vcpu *v)
 
     hvm_stts(v);
 
-    pt_thaw_time(v);
+    pt_restore_timer(v);
 
     /* NB. Optimised for common case (p->state == STATE_IOREQ_NONE). */
     p = &get_ioreq(v)->vp_ioreq;
@@ -222,10 +240,19 @@ int hvm_domain_initialise(struct domain 
         return -EINVAL;
     }
 
+    d->arch.hvm_domain.vmx_apic_access_mfn = INVALID_MFN;
+
     spin_lock_init(&d->arch.hvm_domain.pbuf_lock);
     spin_lock_init(&d->arch.hvm_domain.irq_lock);
     spin_lock_init(&d->arch.hvm_domain.vapic_access_lock);
 
+    INIT_LIST_HEAD(&d->arch.hvm_domain.msixtbl_list);
+    spin_lock_init(&d->arch.hvm_domain.msixtbl_list_lock);
+
+    hvm_init_guest_time(d);
+
+    d->arch.hvm_domain.params[HVM_PARAM_HPET_ENABLED] = 1;
+
     rc = paging_enable(d, PG_refcounts|PG_translate|PG_external);
     if ( rc != 0 )
         return rc;
@@ -236,14 +263,21 @@ int hvm_domain_initialise(struct domain 
     hvm_init_ioreq_page(d, &d->arch.hvm_domain.ioreq);
     hvm_init_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
 
-    return 0;
+    if ( hvm_funcs.domain_initialise )
+        rc = hvm_funcs.domain_initialise(d);
+
+    return rc;
 }
 
+extern void msixtbl_pt_cleanup(struct domain *d);
+
 void hvm_domain_relinquish_resources(struct domain *d)
 {
     hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.ioreq);
     hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
 
+    msixtbl_pt_cleanup(d);
+
     pit_deinit(d);
     rtc_deinit(d);
     pmtimer_deinit(d);
@@ -252,6 +286,8 @@ void hvm_domain_relinquish_resources(str
 
 void hvm_domain_destroy(struct domain *d)
 {
+    if ( hvm_funcs.domain_destroy )
+        hvm_funcs.domain_destroy(d);
 }
 
 static int hvm_save_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
@@ -446,7 +482,7 @@ int hvm_vcpu_initialise(struct vcpu *v)
     hpet_init(v);
  
     /* Init guest TSC to start from zero. */
-    hvm_set_guest_time(v, 0);
+    hvm_set_guest_tsc(v, 0);
 
     return 0;
 }
@@ -553,6 +589,403 @@ void hvm_triple_fault(void)
     domain_shutdown(v->domain, SHUTDOWN_reboot);
 }
 
+int hvm_virtual_to_linear_addr(
+    enum x86_segment seg,
+    struct segment_register *reg,
+    unsigned long offset,
+    unsigned int bytes,
+    enum hvm_access_type access_type,
+    unsigned int addr_size,
+    unsigned long *linear_addr)
+{
+    unsigned long addr = offset;
+    uint32_t last_byte;
+
+    if ( addr_size != 64 )
+    {
+        /*
+         * COMPATIBILITY MODE: Apply segment checks and add base.
+         */
+
+        switch ( access_type )
+        {
+        case hvm_access_read:
+            if ( (reg->attr.fields.type & 0xa) == 0x8 )
+                goto gpf; /* execute-only code segment */
+            break;
+        case hvm_access_write:
+            if ( (reg->attr.fields.type & 0xa) != 0x2 )
+                goto gpf; /* not a writable data segment */
+            break;
+        default:
+            break;
+        }
+
+        last_byte = offset + bytes - 1;
+
+        /* Is this a grows-down data segment? Special limit check if so. */
+        if ( (reg->attr.fields.type & 0xc) == 0x4 )
+        {
+            /* Is upper limit 0xFFFF or 0xFFFFFFFF? */
+            if ( !reg->attr.fields.db )
+                last_byte = (uint16_t)last_byte;
+
+            /* Check first byte and last byte against respective bounds. */
+            if ( (offset <= reg->limit) || (last_byte < offset) )
+                goto gpf;
+        }
+        else if ( (last_byte > reg->limit) || (last_byte < offset) )
+            goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */
+
+        /*
+         * Hardware truncates to 32 bits in compatibility mode.
+         * It does not truncate to 16 bits in 16-bit address-size mode.
+         */
+        addr = (uint32_t)(addr + reg->base);
+    }
+    else
+    {
+        /*
+         * LONG MODE: FS and GS add segment base. Addresses must be canonical.
+         */
+
+        if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) )
+            addr += reg->base;
+
+        if ( !is_canonical_address(addr) )
+            goto gpf;
+    }
+
+    *linear_addr = addr;
+    return 1;
+
+ gpf:
+    return 0;
+}
+
+static void *hvm_map(unsigned long va, int size)
+{
+    unsigned long gfn, mfn;
+
+    if ( ((va & ~PAGE_MASK) + size) > PAGE_SIZE )
+    {
+        hvm_inject_exception(TRAP_page_fault, PFEC_write_access,
+                             (va + PAGE_SIZE - 1) & PAGE_MASK);
+        return NULL;
+    }
+
+    gfn = paging_gva_to_gfn(current, va);
+    mfn = mfn_x(gfn_to_mfn_current(gfn));
+
+    ASSERT(mfn_valid(mfn));
+
+    paging_mark_dirty(current->domain, mfn);
+
+    return (char *)map_domain_page(mfn) + (va & ~PAGE_MASK);
+}
+
+static void hvm_unmap(void *p)
+{
+    if ( p )
+        unmap_domain_page(p);
+}
+
+static int hvm_load_segment_selector(
+    struct vcpu *v, enum x86_segment seg, uint16_t sel)
+{
+    struct segment_register desctab, cs, segr;
+    struct desc_struct *pdesc, desc;
+    u8 dpl, rpl, cpl;
+    int fault_type = TRAP_invalid_tss;
+
+    /* NULL selector? */
+    if ( (sel & 0xfffc) == 0 )
+    {
+        if ( (seg == x86_seg_cs) || (seg == x86_seg_ss) )
+            goto fail;
+        memset(&segr, 0, sizeof(segr));
+        hvm_set_segment_register(v, seg, &segr);
+        return 0;
+    }
+
+    /* LDT descriptor must be in the GDT. */
+    if ( (seg == x86_seg_ldtr) && (sel & 4) )
+        goto fail;
+
+    hvm_get_segment_register(v, x86_seg_cs, &cs);
+    hvm_get_segment_register(
+        v, (sel & 4) ? x86_seg_ldtr : x86_seg_gdtr, &desctab);
+
+    /* Check against descriptor table limit. */
+    if ( ((sel & 0xfff8) + 7) > desctab.limit )
+        goto fail;
+
+    pdesc = hvm_map(desctab.base + (sel & 0xfff8), 8);
+    if ( pdesc == NULL )
+        goto hvm_map_fail;
+
+    do {
+        desc = *pdesc;
+
+        /* Segment present in memory? */
+        if ( !(desc.b & (1u<<15)) )
+        {
+            fault_type = TRAP_no_segment;
+            goto unmap_and_fail;
+        }
+
+        /* LDT descriptor is a system segment. All others are code/data. */
+        if ( (desc.b & (1u<<12)) == ((seg == x86_seg_ldtr) << 12) )
+            goto unmap_and_fail;
+
+        dpl = (desc.b >> 13) & 3;
+        rpl = sel & 3;
+        cpl = cs.sel & 3;
+
+        switch ( seg )
+        {
+        case x86_seg_cs:
+            /* Code segment? */
+            if ( !(desc.b & (1u<<11)) )
+                goto unmap_and_fail;
+            /* Non-conforming segment: check DPL against RPL. */
+            if ( ((desc.b & (6u<<9)) != 6) && (dpl != rpl) )
+                goto unmap_and_fail;
+            break;
+        case x86_seg_ss:
+            /* Writable data segment? */
+            if ( (desc.b & (5u<<9)) != (1u<<9) )
+                goto unmap_and_fail;
+            if ( (dpl != cpl) || (dpl != rpl) )
+                goto unmap_and_fail;
+            break;
+        case x86_seg_ldtr:
+            /* LDT system segment? */
+            if ( (desc.b & (15u<<8)) != (2u<<8) )
+                goto unmap_and_fail;
+            goto skip_accessed_flag;
+        default:
+            /* Readable code or data segment? */
+            if ( (desc.b & (5u<<9)) == (4u<<9) )
+                goto unmap_and_fail;
+            /* Non-conforming segment: check DPL against RPL and CPL. */
+            if ( ((desc.b & (6u<<9)) != 6) && ((dpl < cpl) || (dpl < rpl)) )
+                goto unmap_and_fail;
+            break;
+        }
+    } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */
+              (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) );
+
+    /* Force the Accessed flag in our local copy. */
+    desc.b |= 0x100;
+
+ skip_accessed_flag:
+    hvm_unmap(pdesc);
+
+    segr.base = (((desc.b <<  0) & 0xff000000u) |
+                 ((desc.b << 16) & 0x00ff0000u) |
+                 ((desc.a >> 16) & 0x0000ffffu));
+    segr.attr.bytes = (((desc.b >>  8) & 0x00ffu) |
+                       ((desc.b >> 12) & 0x0f00u));
+    segr.limit = (desc.b & 0x000f0000u) | (desc.a & 0x0000ffffu);
+    if ( segr.attr.fields.g )
+        segr.limit = (segr.limit << 12) | 0xfffu;
+    segr.sel = sel;
+    hvm_set_segment_register(v, seg, &segr);
+
+    return 0;
+
+ unmap_and_fail:
+    hvm_unmap(pdesc);
+ fail:
+    hvm_inject_exception(fault_type, sel & 0xfffc, 0);
+ hvm_map_fail:
+    return 1;
+}
+
+void hvm_task_switch(
+    uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason,
+    int32_t errcode)
+{
+    struct vcpu *v = current;
+    struct cpu_user_regs *regs = guest_cpu_user_regs();
+    struct segment_register gdt, tr, prev_tr, segr;
+    struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc;
+    unsigned long eflags;
+    int exn_raised;
+    struct {
+        u16 back_link,__blh;
+        u32 esp0;
+        u16 ss0, _0;
+        u32 esp1;
+        u16 ss1, _1;
+        u32 esp2;
+        u16 ss2, _2;
+        u32 cr3, eip, eflags, eax, ecx, edx, ebx, esp, ebp, esi, edi;
+        u16 es, _3, cs, _4, ss, _5, ds, _6, fs, _7, gs, _8, ldt, _9;
+        u16 trace, iomap;
+    } *ptss, tss;
+
+    hvm_get_segment_register(v, x86_seg_gdtr, &gdt);
+    hvm_get_segment_register(v, x86_seg_tr, &prev_tr);
+
+    if ( ((tss_sel & 0xfff8) + 7) > gdt.limit )
+    {
+        hvm_inject_exception((taskswitch_reason == TSW_iret) ?
+                             TRAP_invalid_tss : TRAP_gp_fault,
+                             tss_sel & 0xfff8, 0);
+        goto out;
+    }
+
+    optss_desc = hvm_map(gdt.base + (prev_tr.sel & 0xfff8), 8);
+    if ( optss_desc == NULL )
+        goto out;
+
+    nptss_desc = hvm_map(gdt.base + (tss_sel & 0xfff8), 8);
+    if ( nptss_desc == NULL )
+        goto out;
+
+    tss_desc = *nptss_desc;
+    tr.sel = tss_sel;
+    tr.base = (((tss_desc.b <<  0) & 0xff000000u) |
+               ((tss_desc.b << 16) & 0x00ff0000u) |
+               ((tss_desc.a >> 16) & 0x0000ffffu));
+    tr.attr.bytes = (((tss_desc.b >>  8) & 0x00ffu) |
+                     ((tss_desc.b >> 12) & 0x0f00u));
+    tr.limit = (tss_desc.b & 0x000f0000u) | (tss_desc.a & 0x0000ffffu);
+    if ( tr.attr.fields.g )
+        tr.limit = (tr.limit << 12) | 0xfffu;
+
+    if ( !tr.attr.fields.p )
+    {
+        hvm_inject_exception(TRAP_no_segment, tss_sel & 0xfff8, 0);
+        goto out;
+    }
+
+    if ( tr.attr.fields.type != ((taskswitch_reason == TSW_iret) ? 0xb : 0x9) )
+    {
+        hvm_inject_exception(
+            (taskswitch_reason == TSW_iret) ? TRAP_invalid_tss : TRAP_gp_fault,
+            tss_sel & 0xfff8, 0);
+        goto out;
+    }
+
+    if ( !tr.attr.fields.g && (tr.limit < (sizeof(tss)-1)) )
+    {
+        hvm_inject_exception(TRAP_invalid_tss, tss_sel & 0xfff8, 0);
+        goto out;
+    }
+
+    hvm_store_cpu_guest_regs(v, regs, NULL);
+
+    ptss = hvm_map(prev_tr.base, sizeof(tss));
+    if ( ptss == NULL )
+        goto out;
+
+    eflags = regs->eflags;
+    if ( taskswitch_reason == TSW_iret )
+        eflags &= ~X86_EFLAGS_NT;
+
+    ptss->cr3    = hvm_get_guest_ctrl_reg(v, 3);
+    ptss->eip    = regs->eip;
+    ptss->eflags = eflags;
+    ptss->eax    = regs->eax;
+    ptss->ecx    = regs->ecx;
+    ptss->edx    = regs->edx;
+    ptss->ebx    = regs->ebx;
+    ptss->esp    = regs->esp;
+    ptss->ebp    = regs->ebp;
+    ptss->esi    = regs->esi;
+    ptss->edi    = regs->edi;
+
+    hvm_get_segment_register(v, x86_seg_es, &segr);
+    ptss->es = segr.sel;
+    hvm_get_segment_register(v, x86_seg_cs, &segr);
+    ptss->cs = segr.sel;
+    hvm_get_segment_register(v, x86_seg_ss, &segr);
+    ptss->ss = segr.sel;
+    hvm_get_segment_register(v, x86_seg_ds, &segr);
+    ptss->ds = segr.sel;
+    hvm_get_segment_register(v, x86_seg_fs, &segr);
+    ptss->fs = segr.sel;
+    hvm_get_segment_register(v, x86_seg_gs, &segr);
+    ptss->gs = segr.sel;
+    hvm_get_segment_register(v, x86_seg_ldtr, &segr);
+    ptss->ldt = segr.sel;
+
+    hvm_unmap(ptss);
+
+    ptss = hvm_map(tr.base, sizeof(tss));
+    if ( ptss == NULL )
+        goto out;
+
+    if ( hvm_set_cr3(ptss->cr3) )
+        goto out;
+
+    regs->eip    = ptss->eip;
+    regs->eflags = ptss->eflags | 2;
+    regs->eax    = ptss->eax;
+    regs->ecx    = ptss->ecx;
+    regs->edx    = ptss->edx;
+    regs->ebx    = ptss->ebx;
+    regs->esp    = ptss->esp;
+    regs->ebp    = ptss->ebp;
+    regs->esi    = ptss->esi;
+    regs->edi    = ptss->edi;
+
+    if ( (taskswitch_reason == TSW_call_or_int) )
+    {
+        regs->eflags |= X86_EFLAGS_NT;
+        ptss->back_link = prev_tr.sel;
+    }
+
+    exn_raised = 0;
+    if ( hvm_load_segment_selector(v, x86_seg_es, ptss->es) ||
+         hvm_load_segment_selector(v, x86_seg_cs, ptss->cs) ||
+         hvm_load_segment_selector(v, x86_seg_ss, ptss->ss) ||
+         hvm_load_segment_selector(v, x86_seg_ds, ptss->ds) ||
+         hvm_load_segment_selector(v, x86_seg_fs, ptss->fs) ||
+         hvm_load_segment_selector(v, x86_seg_gs, ptss->gs) ||
+         hvm_load_segment_selector(v, x86_seg_ldtr, ptss->ldt) )
+        exn_raised = 1;
+
+    if ( (ptss->trace & 1) && !exn_raised )
+        hvm_inject_exception(TRAP_debug, tss_sel & 0xfff8, 0);
+
+    hvm_unmap(ptss);
+
+    tr.attr.fields.type = 0xb; /* busy 32-bit tss */
+    hvm_set_segment_register(v, x86_seg_tr, &tr);
+
+    hvm_stts(v);
+
+    if ( (taskswitch_reason == TSW_iret) ||
+         (taskswitch_reason == TSW_jmp) )
+        clear_bit(41, optss_desc); /* clear B flag of old task */
+
+    if ( taskswitch_reason != TSW_iret )
+        set_bit(41, nptss_desc); /* set B flag of new task */
+
+    if ( errcode >= 0 )
+    {
+        struct segment_register reg;
+        unsigned long linear_addr;
+        regs->esp -= 4;
+        hvm_get_segment_register(current, x86_seg_ss, &reg);
+        /* Todo: do not ignore access faults here. */
+        if ( hvm_virtual_to_linear_addr(x86_seg_ss, &reg, regs->esp,
+                                        4, hvm_access_write, 32,
+                                        &linear_addr) )
+            hvm_copy_to_guest_virt(linear_addr, &errcode, 4);
+    }
+
+    hvm_load_cpu_guest_regs(v, regs);
+
+ out:
+    hvm_unmap(optss_desc);
+    hvm_unmap(nptss_desc);
+}
+
 /*
  * __hvm_copy():
  *  @buf  = hypervisor buffer
@@ -580,7 +1013,8 @@ static int __hvm_copy(void *buf, paddr_t
         
         mfn = get_mfn_from_gpfn(gfn);
 
-        if ( mfn == INVALID_MFN )
+        if ( (mfn == current->domain->arch.hvm_domain.vmx_apic_access_mfn) ||
+             (mfn == INVALID_MFN) )
             return todo;
 
         p = (char *)map_domain_page(mfn) + (addr & ~PAGE_MASK);
@@ -663,6 +1097,9 @@ void hvm_cpuid(unsigned int input, unsig
 #endif
                 clear_bit(X86_FEATURE_PAE & 31, edx);
             clear_bit(X86_FEATURE_PSE36 & 31, edx);
+
+            /* "Hypervisor present" bit required for Microsoft SVVP. */
+            set_bit(X86_FEATURE_HYPERVISOR & 31, ecx);
         }
         else if ( input == 0x80000001 )
         {
@@ -1106,6 +1543,11 @@ long do_hvm_op(unsigned long op, XEN_GUE
                 hvm_set_callback_via(d, a.value);
                 hvm_latch_shinfo_size(d);
                 break;
+            case HVM_PARAM_TIMER_MODE:
+                rc = -EINVAL;
+                if ( a.value > HVMPTM_one_missed_tick_pending )
+                    goto param_fail;
+                break;
             }
             d->arch.hvm_domain.params[a.index] = a.value;
             rc = 0;
@@ -1144,6 +1586,15 @@ long do_hvm_op(unsigned long op, XEN_GUE
         rc = guest_handle_is_null(arg) ? hvmop_flush_tlb_all() : -ENOSYS;
         break;
 
+    case HVMOP_get_time: {
+        xen_hvm_get_time_t gxt;
+
+        gxt.now = NOW();
+        if ( copy_to_guest(arg, &gxt, 1) )
+            rc = -EFAULT;
+        break;
+    }
+
     default:
     {
         gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
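
The same descriptor unpacking appears twice above, in hvm_load_segment_selector() and hvm_task_switch(). A small sketch of how the base, limit and attribute bits are pulled out of the two 32-bit GDT words (the descriptor value is a made-up flat 4GB code segment):

#include <stdint.h>
#include <stdio.h>

struct desc { uint32_t a, b; };            /* low/high words of a descriptor */

int main(void)
{
    /* Hypothetical flat ring-0 code segment: base 0, limit 0xfffff, G=1. */
    struct desc d = { .a = 0x0000ffff, .b = 0x00cf9a00 };

    uint32_t base  = ((d.b <<  0) & 0xff000000u) |
                     ((d.b << 16) & 0x00ff0000u) |
                     ((d.a >> 16) & 0x0000ffffu);
    uint32_t attr  = ((d.b >>  8) & 0x00ffu) |  /* type, S, DPL, P */
                     ((d.b >> 12) & 0x0f00u);   /* AVL, L, D/B, G  */
    uint32_t limit = (d.b & 0x000f0000u) | (d.a & 0x0000ffffu);

    if ( attr & 0x800 )                         /* G set: 4K granularity */
        limit = (limit << 12) | 0xfffu;

    printf("base=%#x limit=%#x attr=%#x\n", base, limit, attr);
    return 0;
}
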
diff -Naurp xen/arch/x86/hvm/i8254.c xen-redhat/arch/x86/hvm/i8254.c
--- xen/arch/x86/hvm/i8254.c
+++ xen-redhat/arch/x86/hvm/i8254.c
@@ -31,6 +31,7 @@
 #include <xen/lib.h>
 #include <xen/errno.h>
 #include <xen/sched.h>
+#include <asm/time.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/io.h>
 #include <asm/hvm/support.h>
@@ -41,7 +42,7 @@
 #define vcpu_vpit(vcpu)  (domain_vpit((vcpu)->domain))
 #define vpit_domain(pit) (container_of((pit), struct domain, \
                                        arch.hvm_domain.pl_time.vpit))
-#define vpit_vcpu(pit)   (vpit_domain(pit)->vcpu[0])
+#define vpit_vcpu(pit)   (pt_global_vcpu_target(vpit_domain(pit)))
 
 #define RW_STATE_LSB 1
 #define RW_STATE_MSB 2
@@ -51,6 +52,9 @@
 static int handle_pit_io(ioreq_t *p);
 static int handle_speaker_io(ioreq_t *p);
 
+#define get_guest_time(v) \
+   (is_hvm_vcpu(v) ? hvm_get_guest_time(v) : (u64)get_s_time())
+
 /* Compute with 96 bit intermediate result: (a*b)/c */
 static uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
 {
@@ -84,8 +88,8 @@ static int pit_get_count(PITState *pit, 
 
     ASSERT(spin_is_locked(&pit->lock));
 
-    d = muldiv64(hvm_get_guest_time(v) - pit->count_load_time[channel],
-                 PIT_FREQ, ticks_per_sec(v));
+    d = muldiv64(get_guest_time(v) - pit->count_load_time[channel],
+                 PIT_FREQ, SYSTEM_TIME_HZ);
 
     switch ( c->mode )
     {
@@ -115,8 +119,8 @@ static int pit_get_out(PITState *pit, in
 
     ASSERT(spin_is_locked(&pit->lock));
 
-    d = muldiv64(hvm_get_guest_time(v) - pit->count_load_time[channel], 
-                 PIT_FREQ, ticks_per_sec(v));
+    d = muldiv64(get_guest_time(v) - pit->count_load_time[channel], 
+                 PIT_FREQ, SYSTEM_TIME_HZ);
 
     switch ( s->mode )
     {
@@ -162,7 +166,7 @@ static void pit_set_gate(PITState *pit, 
     case 3:
         /* Restart counting on rising edge. */
         if ( s->gate < val )
-            pit->count_load_time[channel] = hvm_get_guest_time(v);
+            pit->count_load_time[channel] = get_guest_time(v);
         break;
     }
 
@@ -178,7 +182,7 @@ int pit_get_gate(PITState *pit, int chan
 static void pit_time_fired(struct vcpu *v, void *priv)
 {
     uint64_t *count_load_time = priv;
-    *count_load_time = hvm_get_guest_time(v);
+    *count_load_time = get_guest_time(v);
 }
 
 static void pit_load_count(PITState *pit, int channel, int val)
@@ -194,11 +198,11 @@ static void pit_load_count(PITState *pit
         val = 0x10000;
 
     if ( v == NULL )
-        rdtscll(pit->count_load_time[channel]);
+        pit->count_load_time[channel] = 0;
     else
-        pit->count_load_time[channel] = hvm_get_guest_time(v);
+        pit->count_load_time[channel] = get_guest_time(v);
     s->count = val;
-    period = DIV_ROUND((val * 1000000000ULL), PIT_FREQ);
+    period = DIV_ROUND(val * SYSTEM_TIME_HZ, PIT_FREQ);
 
     if ( (v == NULL) || !is_hvm_vcpu(v) || (channel != 0) )
         return;
@@ -484,7 +488,7 @@ static int pit_load(struct domain *d, hv
     for ( i = 0; i < 3; i++ )
     {
         pit_load_count(pit, i, pit->hw.channels[i].count);
-        pit->pt[i].last_plt_gtime = hvm_get_guest_time(d->vcpu[0]);
+        pit->pt[i].last_plt_gtime = get_guest_time(d->vcpu[0]);
     }
 
     pit_info(pit);
@@ -517,6 +521,7 @@ void pit_init(struct vcpu *v, unsigned l
         s->mode = 0xff; /* the init mode */
         s->gate = (i != 2);
         pit_load_count(pit, i, 0);
+        pit->pt[i].source = PTSRC_isa;
     }
 
     spin_unlock(&pit->lock);
@@ -598,11 +603,13 @@ int pv_pit_handler(int port, int data, i
         .size = 1,
         .type = IOREQ_TYPE_PIO,
         .addr = port,
-        .dir  = write ? 0 : 1,
-        .data = write ? data : 0,
+        .dir  = write ? IOREQ_WRITE : IOREQ_READ,
+        .data = data
     };
 
-    if ( port == 0x61 )
+    if ( (current->domain->domain_id == 0) && dom0_pit_access(&ioreq) )
+        /* nothing to do */;
+    else if ( port == 0x61 )
         handle_speaker_io(&ioreq);
     else
         handle_pit_io(&ioreq);
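
pit_get_count()/pit_get_out() above now scale by SYSTEM_TIME_HZ through muldiv64(), whose body is unchanged and therefore not shown in the hunks. A rough equivalent of that helper, using the compiler's 128-bit type instead of the hand-rolled 96-bit arithmetic of the in-tree version (constant values here are illustrative):

#include <stdint.h>
#include <stdio.h>

#define PIT_FREQ       1193181u        /* PIT input clock, Hz          */
#define SYSTEM_TIME_HZ 1000000000u     /* Xen system time counts in ns */

/* (a * b) / c without losing the high bits of the product. */
static uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
{
    return (uint64_t)(((unsigned __int128)a * b) / c);
}

int main(void)
{
    /* 10 ms of elapsed system time expressed in PIT ticks. */
    uint64_t elapsed_ns = 10000000ULL;

    printf("%llu PIT ticks\n",
           (unsigned long long)muldiv64(elapsed_ns, PIT_FREQ, SYSTEM_TIME_HZ));
    return 0;
}
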
diff -Naurp xen/arch/x86/hvm/intercept.c xen-redhat/arch/x86/hvm/intercept.c
--- xen/arch/x86/hvm/intercept.c
+++ xen-redhat/arch/x86/hvm/intercept.c
@@ -34,14 +34,16 @@
 extern struct hvm_mmio_handler hpet_mmio_handler;
 extern struct hvm_mmio_handler vlapic_mmio_handler;
 extern struct hvm_mmio_handler vioapic_mmio_handler;
+extern struct hvm_mmio_handler msixtbl_mmio_handler;
 
-#define HVM_MMIO_HANDLER_NR 3
+#define HVM_MMIO_HANDLER_NR 4
 
 static struct hvm_mmio_handler *hvm_mmio_handlers[HVM_MMIO_HANDLER_NR] =
 {
     &hpet_mmio_handler,
     &vlapic_mmio_handler,
-    &vioapic_mmio_handler
+    &vioapic_mmio_handler,
+    &msixtbl_mmio_handler
 };
 
 struct hvm_buffered_io_range {
@@ -58,30 +60,33 @@ static struct hvm_buffered_io_range
     &buffered_stdvga_range
 };
 
-static inline void hvm_mmio_access(struct vcpu *v,
-                                   ioreq_t *p,
-                                   hvm_mmio_read_t read_handler,
-                                   hvm_mmio_write_t write_handler)
+static inline int hvm_mmio_access(struct vcpu *v,
+                                  ioreq_t *p,
+                                  hvm_mmio_read_t read_handler,
+                                  hvm_mmio_write_t write_handler)
 {
-    unsigned int tmp1, tmp2;
+    unsigned long tmp1, tmp2;
     unsigned long data;
+    int rc = 1;
 
     switch ( p->type ) {
     case IOREQ_TYPE_COPY:
     {
         if ( !p->data_is_ptr ) {
-            if ( p->dir == IOREQ_READ )
-                p->data = read_handler(v, p->addr, p->size);
+            if ( p->dir == IOREQ_READ ) {
+                rc = read_handler(v, p->addr, p->size, &data);
+                p->data = data;
+            }
             else    /* p->dir == IOREQ_WRITE */
-                write_handler(v, p->addr, p->size, p->data);
+                rc = write_handler(v, p->addr, p->size, p->data);
         } else {    /* p->data_is_ptr */
             int i, sign = (p->df) ? -1 : 1;
 
             if ( p->dir == IOREQ_READ ) {
                 for ( i = 0; i < p->count; i++ ) {
-                    data = read_handler(v,
+                    rc = read_handler(v,
                         p->addr + (sign * i * p->size),
-                        p->size);
+                        p->size, &data);
                     (void)hvm_copy_to_guest_phys(
                         p->data + (sign * i * p->size),
                         &data,
@@ -93,7 +98,7 @@ static inline void hvm_mmio_access(struc
                         &data,
                         p->data + (sign * i * p->size),
                         p->size);
-                    write_handler(v,
+                    rc = write_handler(v,
                         p->addr + (sign * i * p->size),
                         p->size, data);
                 }
@@ -103,37 +108,37 @@ static inline void hvm_mmio_access(struc
     }
 
     case IOREQ_TYPE_AND:
-        tmp1 = read_handler(v, p->addr, p->size);
-        if ( p->dir == IOREQ_WRITE ) {
+        rc = read_handler(v, p->addr, p->size, &tmp1);
+        if ( rc && p->dir == IOREQ_WRITE ) {
             tmp2 = tmp1 & (unsigned long) p->data;
-            write_handler(v, p->addr, p->size, tmp2);
+            rc = write_handler(v, p->addr, p->size, tmp2);
         }
         p->data = tmp1;
         break;
 
     case IOREQ_TYPE_ADD:
-        tmp1 = read_handler(v, p->addr, p->size);
-        if (p->dir == IOREQ_WRITE) {
+        rc = read_handler(v, p->addr, p->size, &tmp1);
+        if ( rc && p->dir == IOREQ_WRITE) {
             tmp2 = tmp1 + (unsigned long) p->data;
-            write_handler(v, p->addr, p->size, tmp2);
+            rc = write_handler(v, p->addr, p->size, tmp2);
         }
         p->data = tmp1;
         break;
 
     case IOREQ_TYPE_OR:
-        tmp1 = read_handler(v, p->addr, p->size);
-        if ( p->dir == IOREQ_WRITE ) {
+        rc = read_handler(v, p->addr, p->size, &tmp1);
+        if ( rc && p->dir == IOREQ_WRITE ) {
             tmp2 = tmp1 | (unsigned long) p->data;
-            write_handler(v, p->addr, p->size, tmp2);
+            rc = write_handler(v, p->addr, p->size, tmp2);
         }
         p->data = tmp1;
         break;
 
     case IOREQ_TYPE_XOR:
-        tmp1 = read_handler(v, p->addr, p->size);
-        if ( p->dir == IOREQ_WRITE ) {
+        rc = read_handler(v, p->addr, p->size, &tmp1);
+        if ( rc && p->dir == IOREQ_WRITE ) {
             tmp2 = tmp1 ^ (unsigned long) p->data;
-            write_handler(v, p->addr, p->size, tmp2);
+            rc = write_handler(v, p->addr, p->size, tmp2);
         }
         p->data = tmp1;
         break;
@@ -143,25 +148,29 @@ static inline void hvm_mmio_access(struc
          * Note that we don't need to be atomic here since VCPU is accessing
          * its own local APIC.
          */
-        tmp1 = read_handler(v, p->addr, p->size);
-        write_handler(v, p->addr, p->size, (unsigned long) p->data);
+        rc = read_handler(v, p->addr, p->size, &tmp1);
+        if ( rc )
+            rc = write_handler(v, p->addr, p->size, (unsigned long) p->data);
         p->data = tmp1;
         break;
 
     case IOREQ_TYPE_SUB:
-        tmp1 = read_handler(v, p->addr, p->size);
-        if ( p->dir == IOREQ_WRITE ) {
+        rc = read_handler(v, p->addr, p->size, &tmp1);
+        if ( rc && p->dir == IOREQ_WRITE ) {
             tmp2 = tmp1 - (unsigned long) p->data;
-            write_handler(v, p->addr, p->size, tmp2);
+            rc = write_handler(v, p->addr, p->size, tmp2);
         }
         p->data = tmp1;
         break;
 
     default:
+        rc = 0;
         printk("hvm_mmio_access: error ioreq type %x\n", p->type);
         domain_crash_synchronous();
         break;
     }
+
+    return rc;
 }
 
 int hvm_buffered_io_send(ioreq_t *p)
@@ -218,15 +227,11 @@ int hvm_mmio_intercept(ioreq_t *p)
     int i;
 
     for ( i = 0; i < HVM_MMIO_HANDLER_NR; i++ )
-    {
         if ( hvm_mmio_handlers[i]->check_handler(v, p->addr) )
-        {
-            hvm_mmio_access(v, p,
-                            hvm_mmio_handlers[i]->read_handler,
-                            hvm_mmio_handlers[i]->write_handler);
-            return 1;
-        }
-    }
+            return hvm_mmio_access(
+                v, p,
+                hvm_mmio_handlers[i]->read_handler,
+                hvm_mmio_handlers[i]->write_handler);
 
     return 0;
 }
@@ -243,6 +248,9 @@ int hvm_io_intercept(ioreq_t *p, int typ
     int i;
     unsigned long addr, size;
 
+    if ( (type == HVM_PORTIO) && (dpci_ioport_intercept(p)) )
+        return 1;
+
     for (i = 0; i < handler->num_slot; i++) {
         if( type != handler->hdl_list[i].type)
             continue;
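
The pattern repeated across the IOREQ_TYPE_* cases above (read, combine, write back only if the read handler reported success, propagate the final rc) reduces to a helper like this sketch; the typedef and function names are hypothetical stand-ins, not the exact hvm_mmio_read_t/hvm_mmio_write_t prototypes:

typedef int (*mmio_read_fn)(unsigned long addr, unsigned long size,
                            unsigned long *val);
typedef int (*mmio_write_fn)(unsigned long addr, unsigned long size,
                             unsigned long val);

/* Read-modify-write one MMIO location (the IOREQ_TYPE_AND case).
 * Returns 1 on success, 0 if either handler failed. */
int mmio_and(mmio_read_fn rd, mmio_write_fn wr,
             unsigned long addr, unsigned long size,
             unsigned long operand, unsigned long *old)
{
    unsigned long tmp = 0;
    int rc = rd(addr, size, &tmp);

    if ( rc )
        rc = wr(addr, size, tmp & operand);

    *old = tmp;          /* the ioreq carries back the pre-modify value */
    return rc;
}
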
diff -Naurp xen/arch/x86/hvm/io.c xen-redhat/arch/x86/hvm/io.c
--- xen/arch/x86/hvm/io.c
+++ xen-redhat/arch/x86/hvm/io.c
@@ -43,6 +43,7 @@
 
 #include <public/sched.h>
 #include <public/hvm/ioreq.h>
+#include <xen/iocap.h>
 
 #if defined (__i386__)
 static void set_reg_value (int size, int index, int seg, struct cpu_user_regs *regs, long value)
@@ -873,6 +874,108 @@ void hvm_io_assist(void)
     vcpu_end_shutdown_deferral(v);
 }
 
+void dpci_ioport_read(uint32_t mport, ioreq_t *p)
+{
+    int i, sign = p->df ? -1 : 1;
+    uint32_t data = 0;
+
+    for ( i = 0; i < p->count; i++ )
+    {
+        switch ( p->size )
+        {
+        case 1:
+            data = inb(mport);
+            break;
+        case 2:
+            data = inw(mport);
+            break;
+        case 4:
+            data = inl(mport);
+            break;
+        default:
+            BUG();
+        }
+
+        if ( p->data_is_ptr )
+            (void)hvm_copy_to_guest_phys(
+                p->data + (sign * i * p->size), &data, p->size);
+        else
+            p->data = data;
+    }
+}
+
+void dpci_ioport_write(uint32_t mport, ioreq_t *p)
+{
+    int i, sign = p->df ? -1 : 1;
+    uint32_t data;
+
+    for ( i = 0; i < p->count; i++ )
+    {
+        data = p->data;
+        if ( p->data_is_ptr )
+            (void)hvm_copy_from_guest_phys(
+                &data, p->data + (sign * i * p->size), p->size);
+
+        switch ( p->size )
+        {
+        case 1:
+            outb(data, mport);
+            break;
+        case 2:
+            outw(data, mport);
+            break;
+        case 4:
+            outl(data, mport);
+            break;
+        default:
+            BUG();
+        }
+    }
+}
+
+int dpci_ioport_intercept(ioreq_t *p)
+{
+    struct domain *d = current->domain;
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    struct g2m_ioport *g2m_ioport;
+    unsigned int mport, gport = p->addr;
+    unsigned int s = 0, e = 0;
+
+    list_for_each_entry( g2m_ioport, &hd->g2m_ioport_list, list )
+    {
+        s = g2m_ioport->gport;
+        e = s + g2m_ioport->np;
+        if ( (gport >= s) && (gport < e) )
+            goto found;
+    }
+
+    return 0;
+
+ found:
+    mport = (gport - s) + g2m_ioport->mport;
+
+    if ( !ioports_access_permitted(d, mport, mport + p->size - 1) )
+    {
+        gdprintk(XENLOG_ERR, "Error: access to gport=0x%x denied!\n",
+                 (uint32_t)p->addr);
+        return 0;
+    }
+
+    switch ( p->dir )
+    {
+    case IOREQ_READ:
+        dpci_ioport_read(mport, p);
+        break;
+    case IOREQ_WRITE:
+        dpci_ioport_write(mport, p);
+        break;
+    default:
+        gdprintk(XENLOG_ERR, "Error: couldn't handle p->dir = %d", p->dir);
+    }
+
+    return 1;
+}
+
 /*
  * Local variables:
  * mode: C
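
dpci_ioport_intercept() above boils down to a range lookup plus an offset. A stripped-down sketch of just that translation step, with the list handling omitted and example values only:

#include <stdio.h>

/* One guest-to-machine I/O port range, mirroring struct g2m_ioport. */
struct g2m_map {
    unsigned int gport;    /* first guest port   */
    unsigned int mport;    /* first machine port */
    unsigned int np;       /* number of ports    */
};

/* Return the machine port backing a guest port, or -1 if unmapped. */
static int g2m_translate(const struct g2m_map *maps, int n, unsigned int gport)
{
    int i;

    for ( i = 0; i < n; i++ )
    {
        unsigned int s = maps[i].gport, e = s + maps[i].np;
        if ( (gport >= s) && (gport < e) )
            return (gport - s) + maps[i].mport;
    }

    return -1;
}

int main(void)
{
    struct g2m_map maps[] = { { 0x300, 0xec00, 8 } };   /* example mapping */

    printf("guest 0x302 -> machine %#x\n",
           (unsigned int)g2m_translate(maps, 1, 0x302));
    return 0;
}
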
diff -Naurp xen/arch/x86/hvm/irq.c xen-redhat/arch/x86/hvm/irq.c
--- xen/arch/x86/hvm/irq.c
+++ xen-redhat/arch/x86/hvm/irq.c
@@ -125,17 +125,13 @@ void hvm_isa_irq_deassert(
     spin_unlock(&d->arch.hvm_domain.irq_lock);
 }
 
-void hvm_set_callback_irq_level(void)
+static void hvm_set_callback_irq_level(struct vcpu *v)
 {
-    struct vcpu *v = current;
     struct domain *d = v->domain;
     struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
     unsigned int gsi, pdev, pintx, asserted;
 
-    /* Fast lock-free tests. */
-    if ( (v->vcpu_id != 0) ||
-         (hvm_irq->callback_via_type == HVMIRQ_callback_none) )
-        return;
+    ASSERT(v->vcpu_id == 0);
 
     spin_lock(&d->arch.hvm_domain.irq_lock);
 
@@ -177,6 +173,22 @@ void hvm_set_callback_irq_level(void)
     spin_unlock(&d->arch.hvm_domain.irq_lock);
 }
 
+void hvm_maybe_deassert_evtchn_irq(void)
+{
+    struct domain *d = current->domain;
+    struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
+
+    if ( hvm_irq->callback_via_asserted &&
+         !vcpu_info(d->vcpu[0], evtchn_upcall_pending) )
+        hvm_set_callback_irq_level(d->vcpu[0]);
+}
+
+void hvm_assert_evtchn_irq(struct vcpu *v)
+{
+    if ( v->vcpu_id == 0 )
+        hvm_set_callback_irq_level(v);
+}
+
 void hvm_set_pci_link_route(struct domain *d, u8 link, u8 isa_irq)
 {
     struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
@@ -285,71 +297,69 @@ void hvm_set_callback_via(struct domain 
     }
 }
 
-int cpu_has_pending_irq(struct vcpu *v)
+enum hvm_intack hvm_vcpu_has_pending_irq(struct vcpu *v)
 {
     struct hvm_domain *plat = &v->domain->arch.hvm_domain;
 
-    /* APIC */
+    if ( unlikely(v->arch.hvm_vcpu.nmi_pending) )
+        return hvm_intack_nmi;
+
     if ( vlapic_has_interrupt(v) != -1 )
-        return 1;
+        return hvm_intack_lapic;
 
-    /* PIC */
     if ( !vlapic_accept_pic_intr(v) )
-        return 0;
+        return hvm_intack_none;
 
-    return plat->vpic[0].int_output;
+    return plat->vpic[0].int_output ? hvm_intack_pic : hvm_intack_none;
 }
 
-int cpu_get_interrupt(struct vcpu *v, int *type)
+int hvm_vcpu_ack_pending_irq(struct vcpu *v, enum hvm_intack type, int *vector)
 {
-    int vector;
-
-    if ( (vector = cpu_get_apic_interrupt(v, type)) != -1 )
-        return vector;
-
-    if ( (v->vcpu_id == 0) &&
-         ((vector = cpu_get_pic_interrupt(v, type)) != -1) )
-        return vector;
+    switch ( type )
+    {
+    case hvm_intack_nmi:
+        return test_and_clear_bool(v->arch.hvm_vcpu.nmi_pending);
+    case hvm_intack_lapic:
+        return ((*vector = cpu_get_apic_interrupt(v)) != -1);
+    case hvm_intack_pic:
+        ASSERT(v->vcpu_id == 0);
+        return ((*vector = cpu_get_pic_interrupt(v)) != -1);
+    default:
+        break;
+    }
 
-    return -1;
+    return 0;
 }
 
-int get_isa_irq_vector(struct vcpu *v, int isa_irq, int type)
+int get_isa_irq_vector(struct vcpu *v, int isa_irq, enum hvm_intack src)
 {
     unsigned int gsi = hvm_isa_irq_to_gsi(isa_irq);
 
-    if ( type == APIC_DM_EXTINT )
+    if ( src == hvm_intack_pic )
         return (v->domain->arch.hvm_domain.vpic[isa_irq >> 3].irq_base
                 + (isa_irq & 7));
 
+    ASSERT(src == hvm_intack_lapic);
     return domain_vioapic(v->domain)->redirtbl[gsi].fields.vector;
 }
 
 int is_isa_irq_masked(struct vcpu *v, int isa_irq)
 {
     unsigned int gsi = hvm_isa_irq_to_gsi(isa_irq);
+    uint8_t pic_imr = v->domain->arch.hvm_domain.vpic[isa_irq >> 3].imr;
 
-    if ( is_lvtt(v, isa_irq) )
-        return !is_lvtt_enabled(v);
-
-    return ((v->domain->arch.hvm_domain.vpic[isa_irq >> 3].imr &
-             (1 << (isa_irq & 7))) &&
+    return (((pic_imr & (1 << (isa_irq & 7))) || !vlapic_accept_pic_intr(v)) &&
             domain_vioapic(v->domain)->redirtbl[gsi].fields.mask);
 }
 
-/*
- * TODO: 1. Should not need special treatment of event-channel events.
- *       2. Should take notice of interrupt shadows (or clear them).
- */
 int hvm_local_events_need_delivery(struct vcpu *v)
 {
-    int pending;
-
-    pending = (vcpu_info(v, evtchn_upcall_pending) || cpu_has_pending_irq(v));
-    if ( unlikely(pending) )
-        pending = hvm_interrupts_enabled(v); 
+    enum hvm_intack type = hvm_vcpu_has_pending_irq(v);
+ 
+    if ( likely(type == hvm_intack_none) )
+        return 0;
 
-    return pending;
+    return hvm_interrupts_enabled(v, type);
 }
 
 #if 0 /* Keep for debugging */
@@ -388,9 +398,33 @@ static void irq_dump(struct domain *d)
 static int irq_save_pci(struct domain *d, hvm_domain_context_t *h)
 {
     struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
+    unsigned int asserted, pdev, pintx;
+    int rc;
+
+    spin_lock(&d->arch.hvm_domain.irq_lock);
+
+    pdev  = hvm_irq->callback_via.pci.dev;
+    pintx = hvm_irq->callback_via.pci.intx;
+    asserted = (hvm_irq->callback_via_asserted &&
+                (hvm_irq->callback_via_type == HVMIRQ_callback_pci_intx));
+
+    /*
+     * Deassert virtual interrupt via PCI INTx line. The virtual interrupt
+     * status is not save/restored, so the INTx line must be deasserted in
+     * the restore context.
+     */
+    if ( asserted )
+        __hvm_pci_intx_deassert(d, pdev, pintx);
 
     /* Save PCI IRQ lines */
-    return ( hvm_save_entry(PCI_IRQ, 0, h, &hvm_irq->pci_intx) );
+    rc = hvm_save_entry(PCI_IRQ, 0, h, &hvm_irq->pci_intx);
+
+    if ( asserted )
+        __hvm_pci_intx_assert(d, pdev, pintx);    
+
+    spin_unlock(&d->arch.hvm_domain.irq_lock);
+
+    return rc;
 }
 
 static int irq_save_isa(struct domain *d, hvm_domain_context_t *h)
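
The reworked hvm_vcpu_has_pending_irq() above encodes a strict priority between interrupt sources: NMI, then local APIC, then the legacy PIC (and the PIC only when the LAPIC will accept ExtINT). Schematically, with illustrative enum values rather than the real hvm_intack layout:

enum intack { intack_none, intack_pic, intack_lapic, intack_nmi };

enum intack pending_irq_source(int nmi_pending, int lapic_irr,
                               int pic_intr_accepted, int pic_output)
{
    if ( nmi_pending )
        return intack_nmi;

    if ( lapic_irr != -1 )           /* vlapic_has_interrupt() style result */
        return intack_lapic;

    if ( !pic_intr_accepted )        /* vlapic_accept_pic_intr() == 0 */
        return intack_none;

    return pic_output ? intack_pic : intack_none;
}
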
diff -Naurp xen/arch/x86/hvm/Makefile xen-redhat/arch/x86/hvm/Makefile
--- xen/arch/x86/hvm/Makefile
+++ xen-redhat/arch/x86/hvm/Makefile
@@ -16,3 +16,4 @@ obj-y += vioapic.o
 obj-y += vlapic.o
 obj-y += vpic.o
 obj-y += save.o
+obj-y += vmsi.o
diff -Naurp xen/arch/x86/hvm/platform.c xen-redhat/arch/x86/hvm/platform.c
--- xen/arch/x86/hvm/platform.c
+++ xen-redhat/arch/x86/hvm/platform.c
@@ -423,6 +423,17 @@ static int mmio_decode(int address_bytes
         GET_OP_SIZE_FOR_BYTE(size_reg);
         return reg_mem(size_reg, opcode, mmio_op, rex);
 
+    case 0x01: /* add r32/16, m32/16 */
+        mmio_op->instr = INSTR_ADD;
+        GET_OP_SIZE_FOR_NONEBYTE(*op_size);
+        return reg_mem(*op_size, opcode, mmio_op, rex);
+
+    case 0x02: /* add m8, r8 */
+        mmio_op->instr = INSTR_ADD;
+        *op_size = BYTE;
+        GET_OP_SIZE_FOR_BYTE(size_reg);
+        return mem_reg(size_reg, opcode, mmio_op, rex);
+
     case 0x03: /* add m32/16, r32/16 */
         mmio_op->instr = INSTR_ADD;
         GET_OP_SIZE_FOR_NONEBYTE(*op_size);
@@ -472,6 +483,23 @@ static int mmio_decode(int address_bytes
         GET_OP_SIZE_FOR_NONEBYTE(*op_size);
         return mem_reg(*op_size, opcode, mmio_op, rex);
 
+    case 0x28: /* sub r8, m8 */
+        mmio_op->instr = INSTR_SUB;
+        *op_size = BYTE;
+        GET_OP_SIZE_FOR_BYTE(size_reg);
+        return reg_mem(size_reg, opcode, mmio_op, rex);
+
+    case 0x29: /* sub r32/16, m32/16 */
+        mmio_op->instr = INSTR_SUB;
+        GET_OP_SIZE_FOR_NONEBYTE(*op_size);
+        return reg_mem(*op_size, opcode, mmio_op, rex);
+
+    case 0x2A: /* sub m8, r8 */
+        mmio_op->instr = INSTR_SUB;
+        *op_size = BYTE;
+        GET_OP_SIZE_FOR_BYTE(size_reg);
+        return mem_reg(size_reg, opcode, mmio_op, rex);
+
     case 0x2B: /* sub m32/16, r32/16 */
         mmio_op->instr = INSTR_SUB;
         GET_OP_SIZE_FOR_NONEBYTE(*op_size);
@@ -494,6 +522,11 @@ static int mmio_decode(int address_bytes
         GET_OP_SIZE_FOR_BYTE(size_reg);
         return mem_reg(size_reg, opcode, mmio_op, rex);
 
+    case 0x33: /* xor m16/32, r16/32 */
+        mmio_op->instr = INSTR_XOR;
+        GET_OP_SIZE_FOR_NONEBYTE(*op_size);
+        return mem_reg(*op_size, opcode, mmio_op, rex);
+
     case 0x38: /* cmp r8, m8 */
         mmio_op->instr = INSTR_CMP;
         *op_size = BYTE;
@@ -1057,7 +1090,9 @@ void handle_mmio(unsigned long gpa)
         for ( i = 0; i < inst_len; i++ )
             printk(" %02x", inst[i] & 0xFF);
         printk("\n");
-        domain_crash_synchronous();
+
+        hvm_inject_exception(TRAP_invalid_op, -1, 0);
+        return;
     }
 
     regs->eip += inst_len; /* advance %eip */
diff -Naurp xen/arch/x86/hvm/pmtimer.c xen-redhat/arch/x86/hvm/pmtimer.c
--- xen/arch/x86/hvm/pmtimer.c
+++ xen-redhat/arch/x86/hvm/pmtimer.c
@@ -65,14 +65,16 @@ static void pmt_update_sci(PMTState *s)
  * since the last time we did that. */
 static void pmt_update_time(PMTState *s)
 {
-    uint64_t curr_gtime;
+    uint64_t curr_gtime, tmp;
     uint32_t msb = s->pm.tmr_val & TMR_VAL_MSB;
     
     ASSERT(spin_is_locked(&s->lock));
 
     /* Update the timer */
     curr_gtime = hvm_get_guest_time(s->vcpu);
-    s->pm.tmr_val += ((curr_gtime - s->last_gtime) * s->scale) >> 32;
+    tmp = ((curr_gtime - s->last_gtime) * s->scale) + s->not_accounted;
+    s->not_accounted = (uint32_t)tmp;
+    s->pm.tmr_val += tmp >> 32;
     s->pm.tmr_val &= TMR_VAL_MASK;
     s->last_gtime = curr_gtime;
     
@@ -238,6 +240,7 @@ static int pmtimer_load(struct domain *d
 
     /* Calculate future counter values from now. */
     s->last_gtime = hvm_get_guest_time(s->vcpu);
+    s->not_accounted = 0;
 
     /* Set the SCI state from the registers */ 
     pmt_update_sci(s);
@@ -256,7 +259,8 @@ void pmtimer_init(struct vcpu *v)
 
     spin_lock_init(&s->lock);
 
-    s->scale = ((uint64_t)FREQUENCE_PMTIMER << 32) / ticks_per_sec(v);
+    s->scale = ((uint64_t)FREQUENCE_PMTIMER << 32) / SYSTEM_TIME_HZ;
+    s->not_accounted = 0;
     s->vcpu = v;
 
     /* Intercept port I/O (need two handlers because PM1a_CNT is between
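
pmt_update_time() above stops discarding the fractional part of the 32.32 fixed-point multiply between updates. In isolation the accounting step looks like this sketch (the delta value is an example):

#include <stdint.h>
#include <stdio.h>

#define FREQUENCE_PMTIMER 3579545ULL   /* ACPI PM timer frequency, Hz */
#define SYSTEM_TIME_HZ    1000000000ULL

int main(void)
{
    /* 32.32 fixed-point ratio of PM-timer ticks per ns of guest time. */
    uint64_t scale = (FREQUENCE_PMTIMER << 32) / SYSTEM_TIME_HZ;

    uint32_t tmr_val = 0, not_accounted = 0;
    uint64_t delta_ns = 1000000;        /* example: 1 ms between updates */
    int i;

    for ( i = 0; i < 3; i++ )
    {
        /* Carry the fractional part forward instead of discarding it. */
        uint64_t tmp = delta_ns * scale + not_accounted;
        not_accounted = (uint32_t)tmp;          /* low 32 bits: fraction     */
        tmr_val += (uint32_t)(tmp >> 32);       /* high 32 bits: whole ticks */
    }

    printf("tmr_val=%u not_accounted=%u\n", tmr_val, not_accounted);
    return 0;
}
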
diff -Naurp xen/arch/x86/hvm/rtc.c xen-redhat/arch/x86/hvm/rtc.c
--- xen/arch/x86/hvm/rtc.c
+++ xen-redhat/arch/x86/hvm/rtc.c
@@ -32,7 +32,7 @@
 #define vcpu_vrtc(vcpu)  (domain_vrtc((vcpu)->domain))
 #define vrtc_domain(rtc) (container_of((rtc), struct domain, \
                                        arch.hvm_domain.pl_time.vrtc))
-#define vrtc_vcpu(rtc)   (vrtc_domain(rtc)->vcpu[0])
+#define vrtc_vcpu(rtc)   (pt_global_vcpu_target(vrtc_domain(rtc)))
 
 static void rtc_periodic_cb(struct vcpu *v, void *opaque)
 {
@@ -42,14 +42,6 @@ static void rtc_periodic_cb(struct vcpu 
     spin_unlock(&s->lock);
 }
 
-int is_rtc_periodic_irq(void *opaque)
-{
-    RTCState *s = opaque;
-
-    return !(s->hw.cmos_data[RTC_REG_C] & RTC_AF || 
-             s->hw.cmos_data[RTC_REG_C] & RTC_UF);
-}
-
 /* Enable/configure/disable the periodic timer based on the RTC_PIE and
  * RTC_RATE_SELECT settings */
 static void rtc_timer_update(RTCState *s)
@@ -489,6 +481,8 @@ void rtc_init(struct vcpu *v, int base)
 
     spin_lock_init(&s->lock);
 
+    s->pt.source = PTSRC_isa;
+
     s->hw.cmos_data[RTC_REG_A] = RTC_REF_CLCK_32KHZ | 6; /* ~1kHz */
     s->hw.cmos_data[RTC_REG_B] = RTC_24H;
     s->hw.cmos_data[RTC_REG_C] = 0;
diff -Naurp xen/arch/x86/hvm/save.c xen-redhat/arch/x86/hvm/save.c
--- xen/arch/x86/hvm/save.c
+++ xen-redhat/arch/x86/hvm/save.c
@@ -23,6 +23,8 @@
 #include <xen/version.h>
 #include <public/version.h>
 #include <xen/sched.h>
+#include <xen/guest_access.h>
+
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
 #include <asm/hvm/domain.h>
@@ -74,6 +76,53 @@ size_t hvm_save_size(struct domain *d) 
     return sz;
 }
 
+/* Extract a single instance of a save record, by marshalling all
+ * records of that type and copying out the one we need. */
+int hvm_save_one(struct domain *d, uint16_t typecode, uint16_t instance, 
+                 XEN_GUEST_HANDLE_64(uint8_t) handle)
+{
+    int rv = 0;
+    size_t sz = 0;
+    struct vcpu *v;
+    hvm_domain_context_t ctxt = { 0, };
+
+    if ( d->is_dying 
+         || typecode > HVM_SAVE_CODE_MAX 
+         || hvm_sr_handlers[typecode].size < sizeof(struct hvm_save_descriptor)
+         || hvm_sr_handlers[typecode].save == NULL )
+        return -EINVAL;
+
+    if ( hvm_sr_handlers[typecode].kind == HVMSR_PER_VCPU )
+        for_each_vcpu(d, v)
+            sz += hvm_sr_handlers[typecode].size;
+    else 
+        sz = hvm_sr_handlers[typecode].size;
+    
+    if ( (instance + 1) * hvm_sr_handlers[typecode].size > sz )
+        return -EINVAL;
+
+    ctxt.size = sz;
+    ctxt.data = xmalloc_bytes(sz);
+    if ( !ctxt.data )
+        return -ENOMEM;
+
+    if ( hvm_sr_handlers[typecode].save(d, &ctxt) != 0 )
+    {
+        gdprintk(XENLOG_ERR, 
+                 "HVM save: failed to save type %"PRIu16"\n", typecode);
+        rv = -EFAULT;
+    }
+    else if ( copy_to_guest(handle,
+                            ctxt.data 
+                            + (instance * hvm_sr_handlers[typecode].size) 
+                            + sizeof (struct hvm_save_descriptor), 
+                            hvm_sr_handlers[typecode].size
+                            - sizeof (struct hvm_save_descriptor)) )
+        rv = -EFAULT;
+
+    xfree(ctxt.data);
+    return rv;
+}
 
 int hvm_save(struct domain *d, hvm_domain_context_t *h)
 {
@@ -91,6 +140,9 @@ int hvm_save(struct domain *d, hvm_domai
     cpuid(1, &eax, &ebx, &ecx, &edx);
     hdr.cpuid = eax;
 
+    cpuid(0, &eax, &ebx, &ecx, &edx);
+    hdr.pad0 = ecx;
+
     /* Save xen changeset */
     c = strrchr(xen_changeset(), ':');
     if ( c )
@@ -98,8 +150,6 @@ int hvm_save(struct domain *d, hvm_domai
     else 
         hdr.changeset = -1ULL; /* Unknown */
 
-    hdr.pad0 = 0;
-
     if ( hvm_save_entry(HEADER, 0, h, &hdr) != 0 )
     {
         gdprintk(XENLOG_ERR, "HVM save: failed to write header\n");
@@ -161,6 +211,14 @@ int hvm_load(struct domain *d, hvm_domai
         return -1;
     }
 
+    cpuid(0, &eax, &ebx, &ecx, &edx);
+    if (hdr.pad0 != 0 && hdr.pad0 != ecx) {
+        gdprintk(XENLOG_ERR, 
+                 "HVM restore: unsupported cross-vendor migration (saved = "
+                 "%#"PRIx32", host = %#"PRIx32")\n", hdr.pad0, ecx);
+        return -1;
+    } 
+
     cpuid(1, &eax, &ebx, &ecx, &edx);
     /*TODO: need to define how big a difference is acceptable */
     if (hdr.cpuid != eax)
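
The new restore-time check above compares hdr.pad0 against CPUID leaf 0's ECX. Leaf 0 returns the vendor string in EBX, EDX, ECX order, so ECX carries its last four characters ("ntel" vs. "cAMD"), which is what makes it a cheap cross-vendor test. A userspace sketch using GCC's <cpuid.h>:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;

    if ( !__get_cpuid(0, &eax, &ebx, &ecx, &edx) )
        return 1;

    /* On GenuineIntel ECX is "ntel", on AuthenticAMD it is "cAMD". */
    printf("CPUID.0:ECX = %#x (\"%.4s\")\n", ecx, (const char *)&ecx);
    return 0;
}
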
diff -Naurp xen/arch/x86/hvm/svm/asid.c xen-redhat/arch/x86/hvm/svm/asid.c
--- xen/arch/x86/hvm/svm/asid.c
+++ xen-redhat/arch/x86/hvm/svm/asid.c
@@ -78,26 +78,25 @@ static struct svm_asid_data *svm_asid_co
  */
 void svm_asid_init(struct cpuinfo_x86 *c)
 {
-    int nasids;
+    int nasids = 0;
     struct svm_asid_data *data = svm_asid_core_data();
 
-    /* Find #ASID. */
-    nasids = cpuid_ebx(0x8000000A);
-    data->max_asid = nasids - 1;
-
     /* Check if we can use ASIDs. */
     data->erratum170 =
-        !((c->x86 == 0x10) ||
+        !((c->x86 >= 0x10) ||
           ((c->x86 == 0xf) && (c->x86_model >= 0x68) && (c->x86_mask >= 1)));
 
-    printk("AMD SVM: ASIDs %s \n",
-           (data->erratum170 ? "disabled." : "enabled."));
+    if ( !data->erratum170 )
+        nasids = cpuid_ebx(0x8000000A);
+
+    data->max_asid = nasids - 1;
+    printk("AMD SVM: ASIDS %s\n", (nasids ? "enabled." : "disabled."));
 
     /* Initialize ASID assignment. */
-    if ( data->erratum170 )
+    if ( nasids == 0 )
     {
-        /* On errata #170, VCPUs and phys processors should have same
-          generation.  We set both to invalid. */
+        /* In this case, VCPUs and phys processors should have the same
+         * generation.  We set both to invalid. */
         data->core_asid_generation = SVM_ASID_INVALID_GENERATION;
     }
     else
diff -Naurp xen/arch/x86/hvm/svm/emulate.c xen-redhat/arch/x86/hvm/svm/emulate.c
--- xen/arch/x86/hvm/svm/emulate.c
+++ xen-redhat/arch/x86/hvm/svm/emulate.c
@@ -412,13 +412,10 @@ static const u8 *opc_bytes[INSTR_MAX_COU
 /* 
  * Intel has a vmcs entry to give the instruction length. AMD doesn't.  So we
  * have to do a little bit of work to find out... 
- *
- * The caller can either pass a NULL pointer to the guest_eip_buf, or a pointer
- * to enough bytes to satisfy the instruction including prefix bytes.
  */
 int __get_instruction_length_from_list(struct vcpu *v,
         enum instruction_index *list, unsigned int list_count, 
-        u8 *guest_eip_buf, enum instruction_index *match)
+        enum instruction_index *match)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
     unsigned int inst_len = 0;
@@ -426,19 +423,13 @@ int __get_instruction_length_from_list(s
     unsigned int j;
     int found = 0;
     enum instruction_index instr = 0;
-    u8 buffer[MAX_INST_LEN];
-    u8 *buf;
+    int valid_inst_len;
+    u8 buf[MAX_INST_LEN];
     const u8 *opcode = NULL;
 
-    if (guest_eip_buf)
-    {
-        buf = guest_eip_buf;
-    }
-    else
-    {
-        inst_copy_from_guest(buffer, svm_rip2pointer(v), MAX_INST_LEN);
-        buf = buffer;
-    }
+    /* hvm_copy_from_guest_virt returns the number of *unread* bytes.  */
+    valid_inst_len = MAX_INST_LEN -
+        hvm_copy_from_guest_virt(buf, svm_rip2pointer(v), MAX_INST_LEN);
 
     for (j = 0; j < list_count; j++)
     {
@@ -446,14 +437,16 @@ int __get_instruction_length_from_list(s
         opcode = opc_bytes[instr];
         ASSERT(opcode);
 
-        while (inst_len < MAX_INST_LEN && 
+        while (inst_len < valid_inst_len && 
                 is_prefix(buf[inst_len]) && 
                 !is_prefix(opcode[1]))
             inst_len++;
 
         ASSERT(opcode[0] <= 15);    /* Make sure the table is correct. */
-        found = 1;
+        if ( inst_len + opcode[0] > valid_inst_len )
+            continue;
 
+        found = 1;
         for (i = 0; i < opcode[0]; i++)
         {
             /* If the last byte is zero, we just accept it without checking */
@@ -476,7 +469,7 @@ int __get_instruction_length_from_list(s
     {
         inst_len += opcode[0];
 
-        ASSERT(inst_len <= MAX_INST_LEN);
+        ASSERT(inst_len <= valid_inst_len);
 
         if (match)
             *match = instr;
@@ -484,8 +477,9 @@ int __get_instruction_length_from_list(s
         return inst_len;
     }
 
-    printk("%s: Mismatch between expected and actual instruction bytes: "
-            "eip = %lx\n",  __func__, (unsigned long)vmcb->rip);
+    gdprintk(XENLOG_WARNING,
+             "%s: Mismatch between expected and actual instruction bytes: "
+             "eip = %lx\n",  __func__, (unsigned long)vmcb->rip);
     return 0;
 }
 
diff -Naurp xen/arch/x86/hvm/svm/intr.c xen-redhat/arch/x86/hvm/svm/intr.c
--- xen/arch/x86/hvm/svm/intr.c
+++ xen-redhat/arch/x86/hvm/svm/intr.c
@@ -31,6 +31,7 @@
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/io.h>
 #include <asm/hvm/support.h>
+#include <asm/hvm/vlapic.h>
 #include <asm/hvm/svm/svm.h>
 #include <asm/hvm/svm/intr.h>
 #include <xen/event.h>
@@ -39,100 +40,144 @@
 #include <xen/domain_page.h>
 #include <asm/hvm/trace.h>
 
-/*
- * Most of this code is copied from vmx_io.c and modified 
- * to be suitable for SVM.
- */
-
-static inline int svm_inject_extint(struct vcpu *v, int trap)
+static void svm_inject_dummy_vintr(struct vcpu *v)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
     vintr_t intr = vmcb->vintr;
 
-    /* Update only relevant fields */    
     intr.fields.irq = 1;
     intr.fields.intr_masking = 1;
-    intr.fields.vector = trap;
+    intr.fields.vector = 0;
     intr.fields.prio = 0xF;
     intr.fields.ign_tpr = 1;
     vmcb->vintr = intr;
-
-    return 0;
 }
     
-asmlinkage void svm_intr_assist(void) 
+static void svm_inject_nmi(struct vcpu *v)
 {
-    struct vcpu *v = current;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    int intr_type = APIC_DM_EXTINT;
-    int intr_vector = -1;
+    eventinj_t event;
 
-    /*
-     * Previous Interrupt delivery caused this intercept?
-     * This will happen if the injection is latched by the processor (hence
-     * clearing vintr.fields.irq) but then subsequently a fault occurs (e.g.,
-     * due to lack of shadow mapping of guest IDT or guest-kernel stack).
-     * 
-     * NB. Exceptions that fault during delivery are lost. This needs to be
-     * fixed but we'll usually get away with it since faults are usually
-     * idempotent. But this isn't the case for e.g. software interrupts!
-     */
-    if ( vmcb->exitintinfo.fields.v && (vmcb->exitintinfo.fields.type == 0) )
-    {
-        intr_vector = vmcb->exitintinfo.fields.vector;
-        vmcb->exitintinfo.bytes = 0;
-        HVMTRACE_1D(REINJ_VIRQ, v, intr_vector);
-        svm_inject_extint(v, intr_vector);
+    event.bytes = 0;
+    event.fields.v = 1;
+    event.fields.type = EVENTTYPE_NMI;
+    event.fields.vector = 2;
+
+    ASSERT(vmcb->eventinj.fields.v == 0);
+    vmcb->eventinj = event;
+}
+
+static void svm_inject_extint(struct vcpu *v, int vector)
+{
+    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+    eventinj_t event;
+
+    event.bytes = 0;
+    event.fields.v = 1;
+    event.fields.type = EVENTTYPE_INTR;
+    event.fields.vector = vector;
+
+    ASSERT(vmcb->eventinj.fields.v == 0);
+    vmcb->eventinj = event;
+}
+
+static void update_cr8_intercept(
+    struct vcpu *v)
+{
+    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+    struct vlapic *vlapic = vcpu_vlapic(v);
+    int max_irr;
+
+    vmcb->cr_intercepts &= ~CR_INTERCEPT_CR8_WRITE;
+ 
+    /* Is there an interrupt pending at the LAPIC? Nothing to do if not. */
+    if ( !vlapic_enabled(vlapic) || 
+         ((max_irr = vlapic_find_highest_irr(vlapic)) == -1) )
         return;
-    }
+
+    /* Highest-priority pending interrupt is masked by the TPR? */
+    if ( (vmcb->vintr.fields.tpr & 0xf) >= (max_irr >> 4) )
+        vmcb->cr_intercepts |= CR_INTERCEPT_CR8_WRITE;
+}
+    
+static void enable_intr_window(struct vcpu *v, enum hvm_intack intr_source)
+{
+    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+
+    ASSERT(intr_source != hvm_intack_none);
 
     /*
-     * Previous interrupt still pending? This occurs if we return from VMRUN
-     * very early in the entry-to-guest process. Usually this is because an
-     * external physical interrupt was pending when we executed VMRUN.
+     * Create a dummy virtual interrupt to intercept as soon as the
+     * guest can accept the real interrupt.
+     *
+     * TODO: Better NMI handling. We need a way to skip a MOV SS interrupt
+     * shadow. This is hard to do without hardware support. We should also
+     * track 'NMI blocking' from NMI injection until IRET. This can be done
+     * quite easily in software by intercepting the unblocking IRET.
      */
-    if ( vmcb->vintr.fields.irq )
-        return;
+    vmcb->general1_intercepts |= GENERAL1_INTERCEPT_VINTR;
+    HVMTRACE_2D(INJ_VIRQ, v, 0x0, /*fake=*/ 1);
+    svm_inject_dummy_vintr(v);
+}
+
+asmlinkage void svm_intr_assist(void) 
+{
+    struct vcpu *v = current;
+    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+    enum hvm_intack intr_source;
+    int intr_vector;
 
     /* Crank the handle on interrupt state and check for new interrrupts. */
     pt_update_irq(v);
-    hvm_set_callback_irq_level();
-    if ( !cpu_has_pending_irq(v) )
-        return;
+    hvm_dirq_assist(v);
 
-    /*
-     * If the guest can't take an interrupt right now, create a 'fake'
-     * virtual interrupt on to intercept as soon as the guest _can_ take
-     * interrupts.  Do not obtain the next interrupt from the vlapic/pic
-     * if unable to inject.
-     *
-     * Also do this if there is an exception pending.  This is because
-     * the delivery of the exception can arbitrarily delay the injection
-     * of the vintr (for example, if the exception is handled via an
-     * interrupt gate, hence zeroing RFLAGS.IF). In the meantime:
-     * - the vTPR could be modified upwards, so we need to wait until the
-     *   exception is delivered before we can safely decide that an
-     *   interrupt is deliverable; and
-     * - the guest might look at the APIC/PIC state, so we ought not to have 
-     *   cleared the interrupt out of the IRR.
-     */
-    if ( irq_masked(vmcb->rflags) || vmcb->interrupt_shadow 
-         || vmcb->eventinj.fields.v )  
+    do {
+        intr_source = hvm_vcpu_has_pending_irq(v);
+        if ( likely(intr_source == hvm_intack_none) )
+            goto out;
+
+        /*
+         * Pending IRQs must be delayed if:
+         * 1. An event is already pending. This is despite the fact that SVM
+         *    provides a VINTR delivery method quite separate from the EVENTINJ
+         *    mechanism. The event delivery can arbitrarily delay the injection
+         *    of the vintr (for example, if the exception is handled via an
+         *    interrupt gate, hence zeroing RFLAGS.IF). In the meantime:
+         *    - the vTPR could be modified upwards, so we need to wait until
+         *      the exception is delivered before we can safely decide that an
+         *      interrupt is deliverable; and
+         *    - the guest might look at the APIC/PIC state, so we ought not to
+         *      have cleared the interrupt out of the IRR.
+         * 2. The IRQ is masked.
+         */
+        if ( unlikely(vmcb->eventinj.fields.v) ||
+             !hvm_interrupts_enabled(v, intr_source) )
+        {
+            enable_intr_window(v, intr_source);
+            return;
+        }
+    } while ( !hvm_vcpu_ack_pending_irq(v, intr_source, &intr_vector) );
+
+    if ( intr_source == hvm_intack_nmi )
     {
-        vmcb->general1_intercepts |= GENERAL1_INTERCEPT_VINTR;
-        HVMTRACE_2D(INJ_VIRQ, v, 0x0, /*fake=*/ 1);
-        svm_inject_extint(v, 0x0); /* actual vector doesn't matter */
-        return;
+        svm_inject_nmi(v);
+    }
+    else
+    {
+        HVMTRACE_2D(INJ_VIRQ, v, intr_vector, /*fake=*/ 0);
+        svm_inject_extint(v, intr_vector);
+        pt_intr_post(v, intr_vector, intr_source);
     }
 
-    /* Okay, we can deliver the interrupt: grab it and update PIC state. */
-    intr_vector = cpu_get_interrupt(v, &intr_type);
-    BUG_ON(intr_vector < 0);
-
-    HVMTRACE_2D(INJ_VIRQ, v, intr_vector, /*fake=*/ 0);
-    svm_inject_extint(v, intr_vector);
+    /* Is there another IRQ to queue up behind this one? */
+    intr_source = hvm_vcpu_has_pending_irq(v);
+    if ( unlikely(intr_source != hvm_intack_none) )
+    {
+        enable_intr_window(v, intr_source);
+        return;
+    }
 
-    pt_intr_post(v, intr_vector, intr_type);
+ out:
+    update_cr8_intercept(v);
 }
 
 /*
diff -Naurp xen/arch/x86/hvm/svm/svm.c xen-redhat/arch/x86/hvm/svm/svm.c
--- xen/arch/x86/hvm/svm/svm.c
+++ xen-redhat/arch/x86/hvm/svm/svm.c
@@ -49,12 +49,11 @@
 #include <asm/hvm/vpt.h>
 #include <asm/hvm/trace.h>
 #include <asm/hap.h>
+#include <asm/debugger.h>
 
 #define set_segment_register(name, value)  \
     asm volatile ( "movw %%ax ,%%" STR(name) "" : : "a" (value) )
 
-int inst_copy_from_guest(unsigned char *buf, unsigned long guest_eip,
-                         int inst_len);
 asmlinkage void do_IRQ(struct cpu_user_regs *);
 
 static int svm_reset_to_realmode(struct vcpu *v,
@@ -66,9 +65,6 @@ static void *hsa[NR_CPUS] __read_mostly;
 /* vmcb used for extended host state */
 static void *root_vmcb[NR_CPUS] __read_mostly;
 
-/* hardware assisted paging bits */
-extern int opt_hap_enabled;
-
 static void svm_inject_exception(struct vcpu *v, int trap, 
                                         int ev, int error_code)
 {
@@ -87,8 +83,6 @@ static void svm_inject_exception(struct 
     event.fields.ev = ev;
     event.fields.errorcode = error_code;
 
-    ASSERT(vmcb->eventinj.fields.v == 0);
-    
     vmcb->eventinj = event;
 }
 
@@ -374,40 +368,15 @@ int svm_vmcb_save(struct vcpu *v, struct
     c->sysenter_esp = vmcb->sysenter_esp;
     c->sysenter_eip = vmcb->sysenter_eip;
 
-    /* Save any event/interrupt that was being injected when we last
-     * exited.  Although there are three(!) VMCB fields that can contain
-     * active events, we only need to save at most one: because the
-     * intr_assist logic never delivers an IRQ when any other event is
-     * active, we know that the only possible collision is if we inject
-     * a fault while exitintinfo contains a valid event (the delivery of
-     * which caused the last exit).  In that case replaying just the
-     * first event should cause the same behaviour when we restore. */
-    if ( vmcb->vintr.fields.irq 
-         && /* Check it's not a fake interrupt (see svm_intr_assist()) */
-         !(vmcb->general1_intercepts & GENERAL1_INTERCEPT_VINTR) )
-    {
-        c->pending_vector = vmcb->vintr.fields.vector;
-        c->pending_type = 0; /* External interrupt */
-        c->pending_error_valid = 0;
-        c->pending_reserved = 0;
-        c->pending_valid = 1;
-        c->error_code = 0;
-    }
-    else if ( vmcb->exitintinfo.fields.v )
+    c->pending_event = 0;
+    c->error_code = 0;
+    if ( vmcb->eventinj.fields.v &&
+         svm_event_needs_reinjection(vmcb->eventinj.fields.type,
+                                     vmcb->eventinj.fields.vector) )
     {
-        c->pending_event = vmcb->exitintinfo.bytes & 0xffffffff;
-        c->error_code = vmcb->exitintinfo.fields.errorcode;
-    }
-    else if ( vmcb->eventinj.fields.v ) 
-    {
-        c->pending_event = vmcb->eventinj.bytes & 0xffffffff;
+        c->pending_event = (uint32_t)vmcb->eventinj.bytes;
         c->error_code = vmcb->eventinj.fields.errorcode;
     }
-    else 
-    {
-        c->pending_event = 0;
-        c->error_code = 0;
-    }
 
     return 1;
 }
@@ -541,26 +510,23 @@ int svm_vmcb_restore(struct vcpu *v, str
         gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
                  c->pending_event, c->error_code);
 
-        /* VMX uses a different type for #OF and #BP; fold into "Exception"  */
-        if ( c->pending_type == 6 ) 
-            c->pending_type = 3;
-        /* Sanity check */
-        if ( c->pending_type == 1 || c->pending_type > 4 
-             || c->pending_reserved != 0 )
+        if ( (c->pending_type == 1) || (c->pending_type > 6) ||
+             (c->pending_reserved != 0) )
         {
             gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32"\n", 
                      c->pending_event);
             return -EINVAL;
         }
-        /* Put this pending event in exitintinfo and svm_intr_assist()
-         * will reinject it when we return to the guest. */
-        vmcb->exitintinfo.bytes = c->pending_event;
-        vmcb->exitintinfo.fields.errorcode = c->error_code;
+
+        if ( svm_event_needs_reinjection(c->pending_type, c->pending_vector) )
+        {
+            vmcb->eventinj.bytes = c->pending_event;
+            vmcb->eventinj.fields.errorcode = c->error_code;
+        }
     }
 
     paging_update_paging_modes(v);
-    /* signal paging update to ASID handler */
-    svm_asid_g_update_paging (v);
+    svm_asid_g_update_paging(v);
 
     return 0;
  
@@ -582,7 +548,7 @@ static void svm_save_cpu_state(struct vc
     data->msr_efer         = v->arch.hvm_svm.cpu_shadow_efer;
     data->msr_flags        = -1ULL;
 
-    data->tsc = hvm_get_guest_time(v);
+    data->tsc = hvm_get_guest_tsc(v);
 }
 
 
@@ -602,7 +568,7 @@ static void svm_load_cpu_state(struct vc
     if ( !(vmcb->efer & EFER_LMA) )
         vmcb->efer &= ~EFER_LME;
 
-    hvm_set_guest_time(v, data->tsc);
+    hvm_set_guest_tsc(v, data->tsc);
 }
 
 static void svm_save_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
@@ -623,10 +589,15 @@ static int svm_load_vmcb_ctxt(struct vcp
     return 0;
 }
 
-static int svm_interrupts_enabled(struct vcpu *v)
+static int svm_interrupts_enabled(struct vcpu *v, enum hvm_intack type)
 {
-    unsigned long eflags = v->arch.hvm_svm.vmcb->rflags;
-    return !irq_masked(eflags); 
+    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+
+    if ( type == hvm_intack_nmi )
+        return !vmcb->interrupt_shadow;
+
+    ASSERT((type == hvm_intack_pic) || (type == hvm_intack_lapic));
+    return (vmcb->rflags & X86_EFLAGS_IF) && !vmcb->interrupt_shadow;
 }
 
 static int svm_guest_x86_mode(struct vcpu *v)
@@ -694,9 +665,7 @@ static void svm_sync_vmcb(struct vcpu *v
 
     arch_svm->vmcb_in_sync = 1;
 
-    asm volatile (
-        ".byte 0x0f,0x01,0xdb" /* vmsave */
-        : : "a" (__pa(arch_svm->vmcb)) );
+    svm_vmsave(arch_svm->vmcb);
 }
 
 static unsigned long svm_get_segment_base(struct vcpu *v, enum x86_segment seg)
@@ -725,6 +694,9 @@ static void svm_get_segment_register(str
                                      struct segment_register *reg)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+
+    ASSERT(v == current);
+
     switch ( seg )
     {
     case x86_seg_cs:
@@ -761,8 +733,124 @@ static void svm_get_segment_register(str
         svm_sync_vmcb(v);
         memcpy(reg, &vmcb->ldtr, sizeof(*reg));
         break;
-    default: BUG();
+    default:
+        BUG();
+    }
+}
+
+static void svm_set_segment_register(struct vcpu *v, enum x86_segment seg,
+                                     struct segment_register *reg)
+{
+    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+
+    ASSERT(v == current);
+
+    switch ( seg )
+    {
+    case x86_seg_cs:
+        memcpy(&vmcb->cs, reg, sizeof(*reg));
+        guest_cpu_user_regs()->cs = reg->sel;
+        break;
+    case x86_seg_ds:
+        memcpy(&vmcb->ds, reg, sizeof(*reg));
+        break;
+    case x86_seg_es:
+        memcpy(&vmcb->es, reg, sizeof(*reg));
+        break;
+    case x86_seg_fs:
+        svm_sync_vmcb(v);
+        memcpy(&vmcb->fs, reg, sizeof(*reg));
+        svm_vmload(vmcb);
+        break;
+    case x86_seg_gs:
+        svm_sync_vmcb(v);
+        memcpy(&vmcb->gs, reg, sizeof(*reg));
+        svm_vmload(vmcb);
+        break;
+    case x86_seg_ss:
+        memcpy(&vmcb->ss, reg, sizeof(*reg));
+        guest_cpu_user_regs()->ss = reg->sel;
+        break;
+    case x86_seg_tr:
+        svm_sync_vmcb(v);
+        memcpy(&vmcb->tr, reg, sizeof(*reg));
+        svm_vmload(vmcb);
+        break;
+    case x86_seg_gdtr:
+        memcpy(&vmcb->gdtr, reg, sizeof(*reg));
+        break;
+    case x86_seg_idtr:
+        memcpy(&vmcb->idtr, reg, sizeof(*reg));
+        break;
+    case x86_seg_ldtr:
+        svm_sync_vmcb(v);
+        memcpy(&vmcb->ldtr, reg, sizeof(*reg));
+        svm_vmload(vmcb);
+        break;
+    default:
+        BUG();
+    }
+}
+
+static int svm_set_cr3(unsigned long value)
+{
+    struct vcpu *v = current;
+    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+    unsigned long old_base_mfn, mfn;
+
+    if ( paging_mode_hap(v->domain) )
+    {
+        vmcb->cr3 = v->arch.hvm_svm.cpu_cr3 = value;
+        return X86EMUL_OKAY;
+    }
+
+    /* If paging is not enabled yet, simply copy the value to CR3. */
+    if ( !svm_paging_enabled(v) )
+    {
+        v->arch.hvm_svm.cpu_cr3 = value;
+        return X86EMUL_OKAY;
+    }
+
+    /* We make a new one if the shadow does not exist. */
+    if ( value == v->arch.hvm_svm.cpu_cr3 )
+    {
+        /* 
+         * This is simple TLB flush, implying the guest has 
+         * removed some translation or changed page attributes.
+         * We simply invalidate the shadow.
+         */
+        mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
+        if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
+            return X86EMUL_UNHANDLEABLE;
+        paging_update_cr3(v);
+        /* signal paging update to ASID handler */
+        svm_asid_g_mov_to_cr3 (v);
     }
+    else 
+    {
+        /*
+         * If different, make a shadow. Check if the PDBR is valid
+         * first.
+         */
+        HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
+        mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
+        if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
+            return X86EMUL_UNHANDLEABLE;
+
+        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
+        v->arch.guest_table = pagetable_from_pfn(mfn);
+
+        if ( old_base_mfn )
+            put_page(mfn_to_page(old_base_mfn));
+
+        v->arch.hvm_svm.cpu_cr3 = value;
+        update_cr3(v);
+        HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
+        /* signal paging update to ASID handler */
+        svm_asid_g_mov_to_cr3 (v);
+    }
+
+    return X86EMUL_OKAY;
 }
 
 /* Make sure that xen intercepts any FP accesses from current */
@@ -863,10 +951,7 @@ static void svm_ctxt_switch_from(struct 
     svm_save_dr(v);
 
     svm_sync_vmcb(v);
-
-    asm volatile (
-        ".byte 0x0f,0x01,0xda" /* vmload */
-        : : "a" (__pa(root_vmcb[cpu])) );
+    svm_vmload(root_vmcb[cpu]);
 
 #ifdef __x86_64__
     /* Resume use of ISTs now that the host TR is reinstated. */
@@ -902,12 +987,8 @@ static void svm_ctxt_switch_to(struct vc
 
     svm_restore_dr(v);
 
-    asm volatile (
-        ".byte 0x0f,0x01,0xdb" /* vmsave */
-        : : "a" (__pa(root_vmcb[cpu])) );
-    asm volatile (
-        ".byte 0x0f,0x01,0xda" /* vmload */
-        : : "a" (__pa(v->arch.hvm_svm.vmcb)) );
+    svm_vmsave(root_vmcb[cpu]);
+    svm_vmload(v->arch.hvm_svm.vmcb);
 }
 
 static void svm_do_resume(struct vcpu *v) 
@@ -972,10 +1053,10 @@ static void svm_hvm_inject_exception(
     svm_inject_exception(v, trapnr, (errcode != -1), errcode);
 }
 
-static int svm_event_injection_faulted(struct vcpu *v)
+static int svm_event_pending(struct vcpu *v)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    return vmcb->exitintinfo.fields.v;
+    return vmcb->eventinj.fields.v;
 }
 
 static struct hvm_function_table svm_function_table = {
@@ -996,8 +1077,10 @@ static struct hvm_function_table svm_fun
     .get_guest_ctrl_reg   = svm_get_ctrl_reg,
     .get_segment_base     = svm_get_segment_base,
     .get_segment_register = svm_get_segment_register,
+    .set_segment_register = svm_set_segment_register,
     .update_host_cr3      = svm_update_host_cr3,
     .update_guest_cr3     = svm_update_guest_cr3,
+    .set_cr3              = svm_set_cr3,
     .flush_guest_tlbs     = svm_flush_guest_tlbs,
     .update_vtpr          = svm_update_vtpr,
     .stts                 = svm_stts,
@@ -1005,23 +1088,9 @@ static struct hvm_function_table svm_fun
     .inject_exception     = svm_hvm_inject_exception,
     .init_ap_context      = svm_init_ap_context,
     .init_hypercall_page  = svm_init_hypercall_page,
-    .event_injection_faulted = svm_event_injection_faulted
+    .event_pending        = svm_event_pending
 };
 
-static void svm_npt_detect(void)
-{
-    u32 eax, ebx, ecx, edx;
-
-    /* Check CPUID for nested paging support. */
-    cpuid(0x8000000A, &eax, &ebx, &ecx, &edx);
-
-    if ( !(edx & 1) && opt_hap_enabled )
-    {
-        printk("SVM: Nested paging is not supported by this CPU.\n");
-        opt_hap_enabled = 0;
-    }
-}
-
 int start_svm(struct cpuinfo_x86 *c)
 {
     u32 eax, ecx, edx;
@@ -1033,7 +1102,7 @@ int start_svm(struct cpuinfo_x86 *c)
     ecx = cpuid_ecx(0x80000001);
     boot_cpu_data.x86_capability[5] = ecx;
     
-    if ( !(test_bit(X86_FEATURE_SVME, &boot_cpu_data.x86_capability)) )
+    if ( !(test_bit(X86_FEATURE_SVM, &boot_cpu_data.x86_capability)) )
         return 0;
 
     /* Check whether SVM feature is disabled in BIOS */
@@ -1050,8 +1119,6 @@ int start_svm(struct cpuinfo_x86 *c)
 
     write_efer(read_efer() | EFER_SVME);
 
-    svm_npt_detect();
-
     /* Initialize the HSA for this core. */
     phys_hsa = (u64) virt_to_maddr(hsa[cpu]);
     phys_hsa_lo = (u32) phys_hsa;
@@ -1065,12 +1132,12 @@ int start_svm(struct cpuinfo_x86 *c)
         return 1;
 
     setup_vmcb_dump();
+    svm_function_table.hap_supported = (cpuid_edx(0x8000000A) & 1);
+    svm_function_table.hap_1gb_pgtb =
+        (CONFIG_PAGING_LEVELS == 4) ? (cpuid_edx(0x80000001) & 0x04000000) : 0;
 
     hvm_enable(&svm_function_table);
 
-    if ( opt_hap_enabled )
-        printk("SVM: Nested paging enabled.\n");
-        
     return 1;
 }
 
@@ -1096,8 +1163,8 @@ static void svm_do_no_device_fault(struc
         vmcb->cr0 &= ~X86_CR0_TS;
 }
 
-/* Reserved bits ECX: [31:14], [12:4], [2:1]*/
-#define SVM_VCPU_CPUID_L1_ECX_RESERVED 0xffffdff6
+/* Reserved bits ECX: [30:29], [24], [18:14], [11:10], [8:4], [2] */
+#define SVM_VCPU_CPUID_L1_ECX_RESERVED 0x6107cdf4
 /* Reserved bits EDX: [31:29], [27], [22:20], [18], [10] */
 #define SVM_VCPU_CPUID_L1_EDX_RESERVED 0xe8740400
 
@@ -1109,6 +1176,9 @@ static void svm_vmexit_do_cpuid(struct v
     struct vcpu *v = current;
     int inst_len;
 
+    if ( (inst_len = __get_instruction_length(v, INSTR_CPUID)) == 0 )
+        return;
+
     hvm_cpuid(input, &eax, &ebx, &ecx, &edx);
 
     if ( input == 0x00000001 )
@@ -1117,6 +1187,22 @@ static void svm_vmexit_do_cpuid(struct v
         ecx &= ~SVM_VCPU_CPUID_L1_ECX_RESERVED;
         edx &= ~SVM_VCPU_CPUID_L1_EDX_RESERVED;
 
+        /* Clear FMA instruction support. */
+        clear_bit(X86_FEATURE_FMA & 31, &ecx);
+
+        /* Clear x2APIC capability. */
+        clear_bit(X86_FEATURE_X2APIC & 31, &ecx);
+
+        /* Clear MOVBE instruction support. */
+        clear_bit(X86_FEATURE_MOVBE & 31, &ecx);
+
+        /* Clear XSAVE and OSXSAVE bits. */
+        clear_bit(X86_FEATURE_XSAVE & 31, &ecx);
+        clear_bit(X86_FEATURE_OSXSAVE & 31, &ecx);
+
+        /* Clear AVX instruction support. */
+        clear_bit(X86_FEATURE_AVX & 31, &ecx);
+
         /* Guest should only see one logical processor.
          * See details on page 23 of AMD CPUID Specification.
          */
@@ -1129,12 +1215,15 @@ static void svm_vmexit_do_cpuid(struct v
         if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
             clear_bit(X86_FEATURE_APIC & 31, &edx);
 
+        clear_bit(X86_FEATURE_EXTAPIC & 31, &ecx);
+
 #if CONFIG_PAGING_LEVELS >= 3
         if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
 #endif
             clear_bit(X86_FEATURE_PAE & 31, &edx);
 
         clear_bit(X86_FEATURE_PSE36 & 31, &edx);
+        clear_bit(X86_FEATURE_PAGE1GB & 31, &edx);
 
         /* Clear the Cmp_Legacy bit
          * This bit is supposed to be zero when HTT = 0.
@@ -1143,17 +1232,49 @@ static void svm_vmexit_do_cpuid(struct v
         clear_bit(X86_FEATURE_CMP_LEGACY & 31, &ecx);
 
         /* Make SVM feature invisible to the guest. */
-        clear_bit(X86_FEATURE_SVME & 31, &ecx);
+        clear_bit(X86_FEATURE_SVM & 31, &ecx);
 
         /* So far, we do not support 3DNow for the guest. */
         clear_bit(X86_FEATURE_3DNOW & 31, &edx);
         clear_bit(X86_FEATURE_3DNOWEXT & 31, &edx);
+
         /* no FFXSR instructions feature. */
         clear_bit(X86_FEATURE_FFXSR & 31, &edx);
+
+        /* no RDTSCP instruction support */
+        clear_bit(X86_FEATURE_RDTSCP & 31, &edx);
+
+        /* no topology extensions */
+        clear_bit(X86_FEATURE_NODEID_MSR & 31, &ecx);
+        clear_bit(X86_FEATURE_TOPOEXT & 31, &ecx);
+
+        /* no OS Visible Workaround support */
+        clear_bit(X86_FEATURE_OSVW & 31, &ecx);
+
+        /* no Instruction Based Sampling */
+        clear_bit(X86_FEATURE_IBS & 31, &ecx);
+
+        /* no SKINIT and STGI support */
+        clear_bit(X86_FEATURE_SKINIT & 31, &ecx);
+
+        /* no Watchdog Timer */
+        clear_bit(X86_FEATURE_WDT & 31, &ecx);
+
+        /* no Lightweight Profiling support */
+        clear_bit(X86_FEATURE_LWP & 31, &ecx);
+
+        /* no Performance Counter Extensions */
+        clear_bit(X86_FEATURE_PERFCTR_CORE & 31, &ecx);
+        clear_bit(X86_FEATURE_PERFCTR_NB & 31, &ecx);
     }
-    else if ( input == 0x80000007 || input == 0x8000000A )
+    else if ( input == 0x80000007 || input == 0x8000000A ||
+              input == 0x8000001B || input == 0x8000001C ||
+              input == 0x8000001E )
     {
-        /* Mask out features of power management and SVM extension. */
+        /* Mask out features of power management, SVM extension,
+         * Instruction Based Sampling, Lightweight Profiling, and
+         * extended topology reporting
+         */
         eax = ebx = ecx = edx = 0;
     }
     else if ( input == 0x80000008 )
@@ -1170,8 +1291,6 @@ static void svm_vmexit_do_cpuid(struct v
     HVMTRACE_3D(CPUID, v, input,
                 ((uint64_t)eax << 32) | ebx, ((uint64_t)ecx << 32) | edx);
 
-    inst_len = __get_instruction_length(v, INSTR_CPUID, NULL);
-    ASSERT(inst_len > 0);
     __update_guest_eip(vmcb, inst_len);
 }
 
@@ -1272,18 +1391,13 @@ static void svm_get_prefix_info(struct v
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
     unsigned char inst[MAX_INST_LEN];
+    int valid_inst_len;
     int i;
 
-    memset(inst, 0, MAX_INST_LEN);
-    if (inst_copy_from_guest(inst, svm_rip2pointer(v), sizeof(inst)) 
-        != MAX_INST_LEN) 
-    {
-        gdprintk(XENLOG_ERR, "get guest instruction failed\n");
-        domain_crash(current->domain);
-        return;
-    }
+    valid_inst_len = MAX_INST_LEN -
+	    hvm_copy_from_guest_virt(inst, svm_rip2pointer(v), MAX_INST_LEN);
 
-    for (i = 0; i < MAX_INST_LEN; i++)
+    for (i = 0; i < valid_inst_len; i++)
     {
         switch (inst[i])
         {
@@ -1796,7 +1910,7 @@ static void mov_from_cr(int cr, int gp, 
  */
 static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs)
 {
-    unsigned long value, old_cr, old_base_mfn, mfn;
+    unsigned long value, old_cr;
     struct vcpu *v = current;
     struct vlapic *vlapic = vcpu_vlapic(v);
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
@@ -1814,57 +1928,8 @@ static int mov_to_cr(int gpreg, int cr, 
         return svm_set_cr0(value);
 
     case 3:
-        if ( paging_mode_hap(v->domain) )
-        {
-            vmcb->cr3 = v->arch.hvm_svm.cpu_cr3 = value;
-            break;
-        }
-
-        /* If paging is not enabled yet, simply copy the value to CR3. */
-        if ( !svm_paging_enabled(v) )
-        {
-            v->arch.hvm_svm.cpu_cr3 = value;
-            break;
-        }
-
-        /* We make a new one if the shadow does not exist. */
-        if ( value == v->arch.hvm_svm.cpu_cr3 )
-        {
-            /* 
-             * This is simple TLB flush, implying the guest has 
-             * removed some translation or changed page attributes.
-             * We simply invalidate the shadow.
-             */
-            mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
-            if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
-                goto bad_cr3;
-            paging_update_cr3(v);
-            /* signal paging update to ASID handler */
-            svm_asid_g_mov_to_cr3 (v);
-        }
-        else 
-        {
-            /*
-             * If different, make a shadow. Check if the PDBR is valid
-             * first.
-             */
-            HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
-            mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
-            if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
-                goto bad_cr3;
-
-            old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
-            v->arch.guest_table = pagetable_from_pfn(mfn);
-
-            if ( old_base_mfn )
-                put_page(mfn_to_page(old_base_mfn));
-
-            v->arch.hvm_svm.cpu_cr3 = value;
-            update_cr3(v);
-            HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
-            /* signal paging update to ASID handler */
-            svm_asid_g_mov_to_cr3 (v);
-        }
+        if ( svm_set_cr3(value) != X86EMUL_OKAY )
+            goto bad_cr3;
         break;
 
     case 4: /* CR4 */
@@ -1984,26 +2049,26 @@ static int svm_cr_access(struct vcpu *v,
     enum instruction_index list_b[] = {INSTR_MOVCR2, INSTR_SMSW};
     enum instruction_index match;
 
-    inst_copy_from_guest(buffer, svm_rip2pointer(v), sizeof(buffer));
-
-    /* get index to first actual instruction byte - as we will need to know 
-       where the prefix lives later on */
-    index = skip_prefix_bytes(buffer, sizeof(buffer));
-    
     if ( type == TYPE_MOV_TO_CR )
     {
         inst_len = __get_instruction_length_from_list(
-            v, list_a, ARR_SIZE(list_a), &buffer[index], &match);
+            v, list_a, ARR_SIZE(list_a), &match);
     }
     else /* type == TYPE_MOV_FROM_CR */
     {
         inst_len = __get_instruction_length_from_list(
-            v, list_b, ARR_SIZE(list_b), &buffer[index], &match);
+            v, list_b, ARR_SIZE(list_b), &match);
     }
 
-    ASSERT(inst_len > 0);
+    if ( inst_len == 0 )
+        return 0;
 
-    inst_len += index;
+    memset(buffer, 0, MAX_INST_LEN);
+    hvm_copy_from_guest_virt(buffer, svm_rip2pointer(v), MAX_INST_LEN);
+
+    /* get index to first actual instruction byte - as we will need to know 
+       where the prefix lives later on */
+    index = skip_prefix_bytes(buffer, sizeof(buffer));
 
     /* Check for REX prefix - it's ALWAYS the last byte of any prefix bytes */
     if (index > 0 && (buffer[index-1] & 0xF0) == 0x40)
@@ -2078,7 +2143,7 @@ static int svm_cr_access(struct vcpu *v,
                 gdprintk(XENLOG_ERR, "SMSW emulation at guest address: "
                          "%lx failed due to unhandled addressing mode."
                          "ModRM byte was: %x \n", svm_rip2pointer(v), modrm);
-                domain_crash(v->domain);
+                hvm_inject_exception(TRAP_gp_fault, 0, 0);
             }
             inst_len += addr_size;
             offset = *(( unsigned int *) ( void *) &buffer[index + 3]);
@@ -2092,7 +2157,7 @@ static int svm_cr_access(struct vcpu *v,
            gdprintk(XENLOG_ERR, "SMSW emulation at guest address: %lx "
                     "failed due to unhandled addressing mode!"
                     "ModRM byte was: %x \n", svm_rip2pointer(v), modrm);
-           domain_crash(v->domain);
+           hvm_inject_exception(TRAP_gp_fault, 0, 0);
         }
         break;
 
@@ -2100,8 +2165,6 @@ static int svm_cr_access(struct vcpu *v,
         BUG();
     }
 
-    ASSERT(inst_len);
-
     __update_guest_eip(vmcb, inst_len);
     
     return result;
@@ -2124,7 +2187,7 @@ static void svm_do_msr_access(
     {
         switch (ecx) {
         case MSR_IA32_TIME_STAMP_COUNTER:
-            msr_content = hvm_get_guest_time(v);
+            msr_content = hvm_get_guest_tsc(v);
             break;
 
         case MSR_IA32_APICBASE:
@@ -2187,7 +2250,8 @@ static void svm_do_msr_access(
         HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
                     ecx, (unsigned long)regs->eax, (unsigned long)regs->edx);
 
-        inst_len = __get_instruction_length(v, INSTR_RDMSR, NULL);
+        if ( (inst_len = __get_instruction_length(v, INSTR_RDMSR)) == 0 )
+            return;
     }
     else
     {
@@ -2198,7 +2262,7 @@ static void svm_do_msr_access(
         switch (ecx)
         {
         case MSR_IA32_TIME_STAMP_COUNTER:
-            hvm_set_guest_time(v, msr_content);
+            hvm_set_guest_tsc(v, msr_content);
             pt_reset(v);
             break;
 
@@ -2216,7 +2280,8 @@ static void svm_do_msr_access(
             break;
         }
 
-        inst_len = __get_instruction_length(v, INSTR_WRMSR, NULL);
+        if ( (inst_len = __get_instruction_length(v, INSTR_WRMSR)) == 0 )
+            return;
     }
 
     __update_guest_eip(vmcb, inst_len);
@@ -2224,11 +2289,14 @@ static void svm_do_msr_access(
 
 static void svm_vmexit_do_hlt(struct vmcb_struct *vmcb)
 {
+    enum hvm_intack type = hvm_vcpu_has_pending_irq(current);
+
     __update_guest_eip(vmcb, 1);
 
     /* Check for interrupt not handled or new interrupt. */
-    if ( (vmcb->rflags & X86_EFLAGS_IF) &&
-         (vmcb->vintr.fields.irq || cpu_has_pending_irq(current)) ) {
+    if ( vmcb->eventinj.fields.v ||
+         ((type != hvm_intack_none) && hvm_interrupts_enabled(current, type)) )
+    {
         HVMTRACE_1D(HLT, current, /*int pending=*/ 1);
         return;
     }
@@ -2252,7 +2320,8 @@ static void svm_vmexit_do_invd(struct vc
      */
     gdprintk(XENLOG_WARNING, "INVD instruction intercepted - ignored\n");
     
-    inst_len = __get_instruction_length(v, INSTR_INVD, NULL);
+    if ( (inst_len = __get_instruction_length(v, INSTR_INVD)) == 0 )
+        return;
     __update_guest_eip(vmcb, inst_len);
 }    
         
@@ -2264,21 +2333,11 @@ void svm_handle_invlpg(const short invlp
     int inst_len;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 
-    /* 
-     * Unknown how many bytes the invlpg instruction will take.  Use the
-     * maximum instruction length here
-     */
-    if (inst_copy_from_guest(opcode, svm_rip2pointer(v), length) < length)
-    {
-        gdprintk(XENLOG_ERR, "Error reading memory %d bytes\n", length);
-        domain_crash(v->domain);
-        return;
-    }
-
     if (invlpga)
     {
-        inst_len = __get_instruction_length(v, INSTR_INVLPGA, opcode);
-        ASSERT(inst_len > 0);
+        if ( (inst_len =
+                  __get_instruction_length(v, INSTR_INVLPGA)) == 0 )
+            return;
         __update_guest_eip(vmcb, inst_len);
 
         /* 
@@ -2289,10 +2348,15 @@ void svm_handle_invlpg(const short invlp
     }
     else
     {
+        if ( (inst_len =
+                  __get_instruction_length(v, INSTR_INVLPG)) == 0 )
+            return;
+
+        memset(opcode, 0, MAX_INST_LEN);
+        hvm_copy_from_guest_virt(opcode, svm_rip2pointer(v), MAX_INST_LEN);
+
         /* What about multiple prefix codes? */
         prefix = (is_prefix(opcode[0])?opcode[0]:0);
-        inst_len = __get_instruction_length(v, INSTR_INVLPG, opcode);
-        ASSERT(inst_len > 0);
 
         inst_len--;
         length -= inst_len;
@@ -2413,8 +2477,19 @@ asmlinkage void svm_vmexit_handler(struc
     unsigned long eip;
     struct vcpu *v = current;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+    eventinj_t eventinj;
     int inst_len, rc;
 
+    /*
+     * Before doing anything else, we need to sync up the VLAPIC's TPR with
+     * SVM's vTPR if CR8 write interception is currently disabled.  It's OK
+     * if the guest doesn't touch CR8 (e.g. 32-bit Windows) because we update
+     * the vTPR on MMIO writes to the TPR.
+     */
+    if ( !(vmcb->cr_intercepts & CR_INTERCEPT_CR8_WRITE) )
+        vlapic_set_reg(vcpu_vlapic(v), APIC_TASKPRI,
+                       (vmcb->vintr.fields.tpr & 0x0F) << 4);
+ 
     exit_reason = vmcb->exitcode;
 
     HVMTRACE_2D(VMEXIT, v, vmcb->rip, exit_reason);
@@ -2428,6 +2503,15 @@ asmlinkage void svm_vmexit_handler(struc
     perfc_incra(svmexits, exit_reason);
     eip = vmcb->rip;
 
+    /* Event delivery caused this intercept? Queue for redelivery. */
+    eventinj = vmcb->exitintinfo;
+    if ( unlikely(eventinj.fields.v) &&
+         svm_event_needs_reinjection(eventinj.fields.type,
+                                     eventinj.fields.vector) )
+        vmcb->eventinj = eventinj;
+
+    hvm_maybe_deassert_evtchn_irq();
+
     switch ( exit_reason )
     {
     case VMEXIT_INTR:
@@ -2455,8 +2539,10 @@ asmlinkage void svm_vmexit_handler(struc
         if ( !v->domain->debugger_attached )
             goto exit_and_crash;
         /* AMD Vol2, 15.11: INT3, INTO, BOUND intercepts do not update RIP. */
-        inst_len = __get_instruction_length(v, INSTR_INT3, NULL);
+        if ( (inst_len = __get_instruction_length(v, INSTR_INT3)) == 0 )
+            break;
         __update_guest_eip(vmcb, inst_len);
+        current->arch.gdbsx_vcpu_event = TRAP_int3;
         domain_pause_for_debugger();
         break;
 
@@ -2499,12 +2585,20 @@ asmlinkage void svm_vmexit_handler(struc
         svm_vmexit_do_invd(v);
         break;
 
-    case VMEXIT_GDTR_WRITE:
-        printk("WRITE to GDTR\n");
+    case VMEXIT_TASK_SWITCH: {
+        enum hvm_task_switch_reason reason;
+        int32_t errcode = -1;
+        if ( (vmcb->exitinfo2 >> 36) & 1 )
+            reason = TSW_iret;
+        else if ( (vmcb->exitinfo2 >> 38) & 1 )
+            reason = TSW_jmp;
+        else
+            reason = TSW_call_or_int;
+        if ( (vmcb->exitinfo2 >> 44) & 1 )
+            errcode = (uint32_t)vmcb->exitinfo2;
+        hvm_task_switch((uint16_t)vmcb->exitinfo1, reason, errcode);
         break;
-
-    case VMEXIT_TASK_SWITCH:
-        goto exit_and_crash;
+    }
 
     case VMEXIT_CPUID:
         svm_vmexit_do_cpuid(vmcb, regs);
@@ -2523,8 +2617,8 @@ asmlinkage void svm_vmexit_handler(struc
         break;
 
     case VMEXIT_VMMCALL:
-        inst_len = __get_instruction_length(v, INSTR_VMCALL, NULL);
-        ASSERT(inst_len > 0);
+        if ( (inst_len = __get_instruction_length(v, INSTR_VMCALL)) == 0 )
+            break;
         HVMTRACE_1D(VMMCALL, v, regs->eax);
         rc = hvm_do_hypercall(regs);
         if ( rc != HVM_HCALL_preempted )
diff -Naurp xen/arch/x86/hvm/svm/vmcb.c xen-redhat/arch/x86/hvm/svm/vmcb.c
--- xen/arch/x86/hvm/svm/vmcb.c
+++ xen-redhat/arch/x86/hvm/svm/vmcb.c
@@ -129,8 +129,14 @@ static int construct_vmcb(struct vcpu *v
     /* Intercept all debug-register writes. */
     vmcb->dr_intercepts = ~0u;
 
-    /* Intercept all control-register accesses, except to CR2. */
-    vmcb->cr_intercepts = ~(CR_INTERCEPT_CR2_READ | CR_INTERCEPT_CR2_WRITE);
+    /*
+     * Intercept all control-register accesses except for CR2 reads/writes
+     * and CR8 reads (and actually CR8 writes, but that's a special case
+     * that's handled in svm/intr.c). 
+     */
+    vmcb->cr_intercepts = ~(CR_INTERCEPT_CR2_READ |
+                            CR_INTERCEPT_CR2_WRITE |
+                            CR_INTERCEPT_CR8_READ);
 
     /* I/O and MSR permission bitmaps. */
     arch_svm->msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE));
diff -Naurp xen/arch/x86/hvm/vioapic.c xen-redhat/arch/x86/hvm/vioapic.c
--- xen/arch/x86/hvm/vioapic.c
+++ xen-redhat/arch/x86/hvm/vioapic.c
@@ -92,9 +92,9 @@ static unsigned long vioapic_read_indire
     return result;
 }
 
-static unsigned long vioapic_read(struct vcpu *v,
-                                  unsigned long addr,
-                                  unsigned long length)
+static int vioapic_read(
+    struct vcpu *v, unsigned long addr,
+    unsigned long length, unsigned long *pval)
 {
     struct hvm_hw_vioapic *vioapic = domain_vioapic(v->domain);
     uint32_t result;
@@ -118,11 +118,13 @@ static unsigned long vioapic_read(struct
         break;
     }
 
-    return result;
+    *pval = result;
+    return 1;
 }
 
 static void vioapic_write_redirent(
-    struct hvm_hw_vioapic *vioapic, unsigned int idx, int top_word, uint32_t val)
+    struct hvm_hw_vioapic *vioapic, unsigned int idx,
+    int top_word, uint32_t val)
 {
     struct domain *d = vioapic_domain(vioapic);
     struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
@@ -200,10 +202,9 @@ static void vioapic_write_indirect(
     }
 }
 
-static void vioapic_write(struct vcpu *v,
-                          unsigned long addr,
-                          unsigned long length,
-                          unsigned long val)
+static int vioapic_write(
+    struct vcpu *v, unsigned long addr,
+    unsigned long length, unsigned long val)
 {
     struct hvm_hw_vioapic *vioapic = domain_vioapic(v->domain);
 
@@ -228,6 +229,8 @@ static void vioapic_write(struct vcpu *v
     default:
         break;
     }
+
+    return 1;
 }
 
 static int vioapic_range(struct vcpu *v, unsigned long addr)
@@ -254,17 +257,11 @@ static void ioapic_inj_irq(
     HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "irq %d trig %d deliv %d",
                 vector, trig_mode, delivery_mode);
 
-    switch ( delivery_mode )
-    {
-    case dest_Fixed:
-    case dest_LowestPrio:
-        if ( vlapic_set_irq(target, vector, trig_mode) )
-            vcpu_kick(vlapic_vcpu(target));
-        break;
-    default:
-        gdprintk(XENLOG_WARNING, "error delivery mode %d\n", delivery_mode);
-        break;
-    }
+    ASSERT((delivery_mode == dest_Fixed) ||
+           (delivery_mode == dest_LowestPrio));
+
+    if ( vlapic_set_irq(target, vector, trig_mode) )
+        vcpu_kick(vlapic_vcpu(target));
 }
 
 static uint32_t ioapic_get_delivery_bitmask(
@@ -311,7 +308,7 @@ static inline int pit_channel0_enabled(v
 {
     PITState *pit = &current->domain->arch.hvm_domain.pl_time.vpit;
     struct periodic_time *pt = &pit->pt[0];
-    return pt->enabled;
+    return pt_active(pt);
 }
 
 static void vioapic_deliver(struct hvm_hw_vioapic *vioapic, int irq)
@@ -393,10 +390,21 @@ static void vioapic_deliver(struct hvm_h
         break;
     }
 
-    case dest_SMI:
     case dest_NMI:
-    case dest_INIT:
-    case dest__reserved_2:
+    {
+        uint8_t bit;
+        for ( bit = 0; deliver_bitmask != 0; bit++ )
+        {
+            if ( !(deliver_bitmask & (1 << bit)) )
+                continue;
+            deliver_bitmask &= ~(1 << bit);
+            if ( ((v = vioapic_domain(vioapic)->vcpu[bit]) != NULL) &&
+                 !test_and_set_bool(v->arch.hvm_vcpu.nmi_pending) )
+                vcpu_kick(v);
+        }
+        break;
+    }
+
     default:
         gdprintk(XENLOG_WARNING, "Unsupported delivery mode %d\n",
                  delivery_mode);
@@ -458,6 +466,14 @@ void vioapic_update_EOI(struct domain *d
     ent = &vioapic->redirtbl[gsi];
 
     ent->fields.remote_irr = 0;
+
+    if ( iommu_enabled )
+    {
+        spin_unlock(&d->arch.hvm_domain.irq_lock);
+        hvm_dpci_eoi(current->domain, gsi, ent);
+        spin_lock(&d->arch.hvm_domain.irq_lock);
+    }
+
     if ( (ent->fields.trig_mode == VIOAPIC_LEVEL_TRIG) &&
          !ent->fields.mask &&
          hvm_irq->gsi_assert_count[gsi] )
diff -Naurp xen/arch/x86/hvm/vlapic.c xen-redhat/arch/x86/hvm/vlapic.c
--- xen/arch/x86/hvm/vlapic.c
+++ xen-redhat/arch/x86/hvm/vlapic.c
@@ -67,9 +67,6 @@ static unsigned int vlapic_lvt_mask[VLAP
 #define APIC_DEST_NOSHORT                0x0
 #define APIC_DEST_MASK                   0x800
 
-#define vlapic_lvt_enabled(vlapic, lvt_type)                    \
-    (!(vlapic_get_reg(vlapic, lvt_type) & APIC_LVT_MASKED))
-
 #define vlapic_lvt_vector(vlapic, lvt_type)                     \
     (vlapic_get_reg(vlapic, lvt_type) & APIC_VECTOR_MASK)
 
@@ -293,7 +290,8 @@ static int vlapic_accept_irq(struct vcpu
         break;
 
     case APIC_DM_NMI:
-        gdprintk(XENLOG_WARNING, "Ignoring guest NMI\n");
+        if ( !test_and_set_bool(v->arch.hvm_vcpu.nmi_pending) )
+            vcpu_kick(v);
         break;
 
     case APIC_DM_INIT:
@@ -376,6 +374,8 @@ void vlapic_EOI_set(struct vlapic *vlapi
 
     if ( vlapic_test_and_clear_vector(vector, &vlapic->regs->data[APIC_TMR]) )
         vioapic_update_EOI(vlapic_domain(vlapic), vector);
+
+    hvm_dpci_msi_eoi(current->domain, vector);
 }
 
 static void vlapic_ipi(struct vlapic *vlapic)
@@ -428,8 +428,7 @@ static uint32_t vlapic_get_tmcct(struct 
     uint32_t tmcct, tmict = vlapic_get_reg(vlapic, APIC_TMICT);
     uint64_t counter_passed;
 
-    counter_passed = (hvm_get_guest_time(v) - vlapic->pt.last_plt_gtime) // TSC
-                     * 1000000000ULL / ticks_per_sec(v) // NS
+    counter_passed = (hvm_get_guest_time(v) - vlapic->timer_last_update)
                      / APIC_BUS_CYCLE_NS / vlapic->hw.timer_divisor;
     tmcct = tmict - counter_passed;
 
@@ -476,17 +475,18 @@ static void vlapic_read_aligned(struct v
     }
 }
 
-static unsigned long vlapic_read(struct vcpu *v, unsigned long address,
-                                 unsigned long len)
+static int vlapic_read(
+    struct vcpu *v, unsigned long address,
+    unsigned long len, unsigned long *pval)
 {
     unsigned int alignment;
     unsigned int tmp;
-    unsigned long result;
+    unsigned long result = 0;
     struct vlapic *vlapic = vcpu_vlapic(v);
     unsigned int offset = address - vlapic_base_address(vlapic);
 
     if ( offset > APIC_TDCR )
-        return 0;
+        goto out;
 
     /* some bugs on kernel cause read this with byte*/
     if ( len != 4 )
@@ -522,15 +522,22 @@ static unsigned long vlapic_read(struct 
     HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "offset 0x%x with length 0x%lx, "
                 "and the result is 0x%lx", offset, len, result);
 
-    return result;
+    goto out;
 
  exit_and_crash:
     domain_crash(v->domain);
-    return 0;
+ out:
+    *pval = result;
+    return 1;
 }
 
-static void vlapic_write(struct vcpu *v, unsigned long address,
-                         unsigned long len, unsigned long val)
+void vlapic_pt_cb(struct vcpu *v, void *data)
+{
+    *(s_time_t *)data = hvm_get_guest_time(v);
+}
+
+static int vlapic_write(struct vcpu *v, unsigned long address,
+                        unsigned long len, unsigned long val)
 {
     struct vlapic *vlapic = vcpu_vlapic(v);
     unsigned int offset = address - vlapic_base_address(vlapic);
@@ -547,13 +554,13 @@ static void vlapic_write(struct vcpu *v,
     val &= 0xffffffff;
     if ( len != 4 )
     {
-        unsigned int tmp;
+        unsigned long tmp;
         unsigned char alignment;
 
         gdprintk(XENLOG_INFO, "Notice: Local APIC write with len = %lx\n",len);
 
         alignment = offset & 0x3;
-        tmp = vlapic_read(v, offset & ~0x3, 4);
+        (void)vlapic_read(v, offset & ~0x3, 4, &tmp);
 
         switch ( len )
         {
@@ -579,7 +586,7 @@ static void vlapic_write(struct vcpu *v,
                      "should be 4 instead\n", len);
         exit_and_crash:
             domain_crash(v->domain);
-            return;
+            return 0;
         }
     }
 
@@ -650,6 +657,8 @@ static void vlapic_write(struct vcpu *v,
             val |= APIC_LVT_MASKED;
         val &= vlapic_lvt_mask[(offset - APIC_LVTT) >> 4];
         vlapic_set_reg(vlapic, offset, val);
+        if ( offset == APIC_LVT0 )
+            vlapic_adjust_i8259_target(v->domain);
         break;
 
     case APIC_TMICT:
@@ -658,7 +667,9 @@ static void vlapic_write(struct vcpu *v,
 
         vlapic_set_reg(vlapic, APIC_TMICT, val);
         create_periodic_time(current, &vlapic->pt, period, vlapic->pt.irq,
-                             !vlapic_lvtt_period(vlapic), NULL, vlapic);
+                             !vlapic_lvtt_period(vlapic), vlapic_pt_cb,
+                             &vlapic->timer_last_update);
+        vlapic->timer_last_update = vlapic->pt.last_plt_gtime;
 
         HVM_DBG_LOG(DBG_LEVEL_VLAPIC,
                     "bus cycle is %uns, "
@@ -678,6 +689,8 @@ static void vlapic_write(struct vcpu *v,
                  "Local APIC Write to read-only register 0x%x\n", offset);
         break;
     }
+
+    return 1;
 }
 
 static int vlapic_range(struct vcpu *v, unsigned long addr)
@@ -714,18 +727,44 @@ void vlapic_msr_set(struct vlapic *vlapi
                 "apic base msr is 0x%016"PRIx64, vlapic->hw.apic_base_msr);
 }
 
-int vlapic_accept_pic_intr(struct vcpu *v)
+static int __vlapic_accept_pic_intr(struct vcpu *v)
 {
     struct vlapic *vlapic = vcpu_vlapic(v);
     uint32_t lvt0 = vlapic_get_reg(vlapic, APIC_LVT0);
 
     /*
-     * Only CPU0 is wired to the 8259A. INTA cycles occur if LINT0 is set up
-     * accept ExtInts, or if the LAPIC is disabled (so LINT0 behaves as INTR).
+     * INTA cycles occur if LINT0 is set up to accept ExtInts, or if
+     * the LAPIC is disabled (so LINT0 behaves as INTR).
      */
-    return ((v->vcpu_id == 0) &&
-            (((lvt0 & (APIC_MODE_MASK|APIC_LVT_MASKED)) == APIC_DM_EXTINT) ||
-             vlapic_hw_disabled(vlapic)));
+    return ((lvt0 & (APIC_MODE_MASK|APIC_LVT_MASKED)) == APIC_DM_EXTINT ||
+            /* LAPIC is fully disabled? */
+            vlapic_hw_disabled(vlapic));
+}
+
+int vlapic_accept_pic_intr(struct vcpu *v)
+{
+    /* By default, deliver 8259A interrupts to CPU0.  */
+    return ((v->domain->arch.hvm_domain.i8259_target
+	     ? v == v->domain->arch.hvm_domain.i8259_target
+	     : v->vcpu_id == 0) &&
+            __vlapic_accept_pic_intr(v));
+}
+
+void vlapic_adjust_i8259_target(struct domain *d)
+{
+    struct vcpu *v;
+
+    for_each_vcpu ( d, v )
+        if ( __vlapic_accept_pic_intr(v) )
+            goto found;
+
+    v = d->vcpu ? d->vcpu[0] : NULL;
+
+ found:
+    if ( d->arch.hvm_domain.i8259_target == v )
+        return;
+    d->arch.hvm_domain.i8259_target = v;
+    pt_adjust_global_vcpu_target(v);
 }
 
 int vlapic_has_interrupt(struct vcpu *v)
@@ -744,7 +783,7 @@ int vlapic_has_interrupt(struct vcpu *v)
     return highest_irr;
 }
 
-int cpu_get_apic_interrupt(struct vcpu *v, int *mode)
+int cpu_get_apic_interrupt(struct vcpu *v)
 {
     int vector = vlapic_has_interrupt(v);
     struct vlapic *vlapic = vcpu_vlapic(v);
@@ -754,8 +793,6 @@ int cpu_get_apic_interrupt(struct vcpu *
  
     vlapic_set_vector(vector, &vlapic->regs->data[APIC_ISR]);
     vlapic_clear_irr(vector, vlapic);
-
-    *mode = APIC_DM_FIXED;
     return vector;
 }
 
@@ -817,7 +854,9 @@ static void lapic_rearm(struct vlapic *s
 
         s->pt.irq = lvtt & APIC_VECTOR_MASK;
         create_periodic_time(vlapic_vcpu(s), &s->pt, period, s->pt.irq,
-                             !vlapic_lvtt_period(s), NULL, s);
+                             !vlapic_lvtt_period(s), vlapic_pt_cb,
+                             &s->timer_last_update);
+        s->timer_last_update = s->pt.last_plt_gtime;
 
         printk("lapic_load to rearm the actimer:"
                     "bus cycle is %uns, "
@@ -898,6 +937,7 @@ static int lapic_load_regs(struct domain
     if ( hvm_load_entry(LAPIC_REGS, h, s->regs) != 0 ) 
         return -EINVAL;
 
+    vlapic_adjust_i8259_target(d);
     lapic_rearm(s);
     return 0;
 }
@@ -913,6 +953,8 @@ int vlapic_init(struct vcpu *v)
 
     HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "%d", v->vcpu_id);
 
+    vlapic->pt.source = PTSRC_lapic;
+
     vlapic->regs_page = alloc_domheap_page(NULL);
     if ( vlapic->regs_page == NULL )
     {
@@ -949,18 +991,3 @@ void vlapic_destroy(struct vcpu *v)
     unmap_domain_page_global(vlapic->regs);
     free_domheap_page(vlapic->regs_page);
 }
-
-int is_lvtt(struct vcpu *v, int vector)
-{
-    return vcpu_vlapic(v)->pt.enabled &&
-           vector == vlapic_lvt_vector(vcpu_vlapic(v), APIC_LVTT);
-}
-
-int is_lvtt_enabled(struct vcpu *v)
-{
-    if ( unlikely(!vlapic_enabled(vcpu_vlapic(v))) ||
-            !vlapic_lvt_enabled(vcpu_vlapic(v), APIC_LVTT)) 
-        return 0;
-
-    return 1;
-}
diff -Naurp xen/arch/x86/hvm/vmsi.c xen-redhat/arch/x86/hvm/vmsi.c
--- xen/arch/x86/hvm/vmsi.c
+++ xen-redhat/arch/x86/hvm/vmsi.c
@@ -0,0 +1,493 @@
+/*
+ *  Copyright (C) 2001  MandrakeSoft S.A.
+ *
+ *    MandrakeSoft S.A.
+ *    43, rue d'Aboukir
+ *    75002 Paris - France
+ *    http://www.linux-mandrake.com/
+ *    http://www.mandrakesoft.com/
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * Support for virtual MSI logic
+ * Will be merged with the virtual IOAPIC logic, since most of it is the same.
+*/
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/xmalloc.h>
+#include <xen/lib.h>
+#include <xen/errno.h>
+#include <xen/sched.h>
+#include <public/hvm/ioreq.h>
+#include <asm/hvm/io.h>
+#include <asm/hvm/vpic.h>
+#include <asm/hvm/vlapic.h>
+#include <asm/hvm/support.h>
+#include <asm/current.h>
+#include <asm/event.h>
+
+static uint32_t vmsi_get_delivery_bitmask(
+    struct domain *d, uint16_t dest, uint8_t dest_mode)
+{
+    uint32_t mask = 0;
+    struct vcpu *v;
+
+    HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "ioapic_get_delivery_bitmask "
+                "dest %d dest_mode %d\n", dest, dest_mode);
+
+    if ( dest_mode == 0 ) /* Physical mode. */
+    {
+        if ( dest == 0xFF ) /* Broadcast. */
+        {
+            for_each_vcpu ( d, v )
+                mask |= 1 << v->vcpu_id;
+            goto out;
+        }
+
+        for_each_vcpu ( d, v )
+        {
+            if ( VLAPIC_ID(vcpu_vlapic(v)) == dest )
+            {
+                mask = 1 << v->vcpu_id;
+                break;
+            }
+        }
+    }
+    else if ( dest != 0 ) /* Logical mode, MDA non-zero. */
+    {
+        for_each_vcpu ( d, v )
+            if ( vlapic_match_logical_addr(vcpu_vlapic(v), dest) )
+                mask |= 1 << v->vcpu_id;
+    }
+
+ out:
+    HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "ioapic_get_delivery_bitmask mask %x\n",
+                mask);
+    return mask;
+}
+
+static void vmsi_inj_irq(
+    struct domain *d,
+    struct vlapic *target,
+    uint8_t vector,
+    uint8_t trig_mode,
+    uint8_t delivery_mode)
+{
+    HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "ioapic_inj_irq "
+                "irq %d trig %d delive mode %d\n",
+                vector, trig_mode, delivery_mode);
+
+    switch ( delivery_mode )
+    {
+    case dest_Fixed:
+    case dest_LowestPrio:
+        if ( vlapic_set_irq(target, vector, trig_mode) )
+            vcpu_kick(vlapic_vcpu(target));
+        break;
+    default:
+        gdprintk(XENLOG_WARNING, "error delivery mode %d\n", delivery_mode);
+        break;
+    }
+}
+
+#define VMSI_DEST_ID_MASK 0xff
+#define VMSI_RH_MASK      0x100
+#define VMSI_DM_MASK      0x200
+#define VMSI_DELIV_MASK   0x7000
+#define VMSI_TRIG_MODE    0x8000
+
+#define GFLAGS_SHIFT_DEST_ID        0
+#define GFLAGS_SHIFT_RH             8
+#define GFLAGS_SHIFT_DM             9
+#define GLFAGS_SHIFT_DELIV_MODE     12
+#define GLFAGS_SHIFT_TRG_MODE       15
+
+int vmsi_deliver(struct domain *d, int pirq)
+{
+    struct hvm_irq_dpci *hvm_irq_dpci = d->arch.hvm_domain.irq.dpci;
+    uint32_t flags = hvm_irq_dpci->mirq[pirq].gmsi.gflags;
+    int vector = hvm_irq_dpci->mirq[pirq].gmsi.gvec;
+    uint16_t dest = (flags & VMSI_DEST_ID_MASK) >> GFLAGS_SHIFT_DEST_ID;
+    uint8_t dest_mode = (flags & VMSI_DM_MASK) >> GFLAGS_SHIFT_DM;
+    uint8_t delivery_mode = (flags & VMSI_DELIV_MASK) >> GLFAGS_SHIFT_DELIV_MODE;
+    uint8_t trig_mode = (flags & VMSI_TRIG_MODE) >> GLFAGS_SHIFT_TRG_MODE;
+    uint32_t deliver_bitmask;
+    struct vlapic *target;
+    struct vcpu *v;
+
+    HVM_DBG_LOG(DBG_LEVEL_IOAPIC,
+                "msi: dest=%x dest_mode=%x delivery_mode=%x "
+                "vector=%x trig_mode=%x\n",
+                dest, dest_mode, delivery_mode, vector, trig_mode);
+
+    if ( !test_bit(_HVM_IRQ_DPCI_MSI, &hvm_irq_dpci->mirq[pirq].flags) )
+    {
+        gdprintk(XENLOG_WARNING, "pirq %x not msi \n", pirq);
+        return 0;
+    }
+
+    deliver_bitmask = vmsi_get_delivery_bitmask(d, dest, dest_mode);
+    if ( !deliver_bitmask )
+    {
+        HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "ioapic deliver "
+                    "no target on destination\n");
+        return 0;
+    }
+
+    switch ( delivery_mode )
+    {
+    case dest_LowestPrio:
+    {
+        /* N.B. backport, from apic_lowest_prio, vector is not used */
+        target = apic_round_robin(d, 0, deliver_bitmask);
+        if ( target != NULL )
+            vmsi_inj_irq(d, target, vector, trig_mode, delivery_mode);
+        else
+            HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "null round robin: "
+                        "mask=%x vector=%x delivery_mode=%x\n",
+                        deliver_bitmask, vector, dest_LowestPrio);
+        break;
+    }
+
+    case dest_Fixed:
+    case dest_ExtINT:
+    {
+        uint8_t bit;
+        for ( bit = 0; deliver_bitmask != 0; bit++ )
+        {
+            if ( !(deliver_bitmask & (1 << bit)) )
+                continue;
+            deliver_bitmask &= ~(1 << bit);
+            v = d->vcpu[bit];
+            if ( v != NULL )
+            {
+                target = vcpu_vlapic(v);
+                vmsi_inj_irq(d, target, vector, trig_mode, delivery_mode);
+            }
+        }
+        break;
+    }
+
+    case dest_SMI:
+    case dest_NMI:
+    case dest_INIT:
+    case dest__reserved_2:
+    default:
+        gdprintk(XENLOG_WARNING, "Unsupported delivery mode %d\n",
+                 delivery_mode);
+        break;
+    }
+    return 1;
+}
+
+/* MSI-X mask bit hypervisor interception */
+struct msixtbl_entry
+{
+    struct list_head list;
+    atomic_t refcnt;    /* how many bind_pt_irq called for the device */
+
+    /* TODO: resolve the potential race by destruction of pdev */
+    struct pci_dev *pdev;
+    unsigned long gtable;       /* gpa of msix table */
+    unsigned long table_len;
+    unsigned long table_flags[MAX_MSIX_TABLE_ENTRIES / BITS_PER_LONG + 1];
+
+    struct rcu_head rcu;
+};
+
+static struct msixtbl_entry *msixtbl_find_entry(
+    struct vcpu *v, unsigned long addr)
+{
+    struct msixtbl_entry *entry;
+    struct domain *d = v->domain;
+
+    list_for_each_entry( entry, &d->arch.hvm_domain.msixtbl_list, list )
+        if ( addr >= entry->gtable &&
+             addr < entry->gtable + entry->table_len )
+            return entry;
+
+    return NULL;
+}
+
+static void __iomem *msixtbl_addr_to_virt(
+    struct msixtbl_entry *entry, unsigned long addr)
+{
+    int idx, nr_page;
+
+    if ( !entry )
+        return NULL;
+
+    nr_page = (addr >> PAGE_SHIFT) -
+              (entry->gtable >> PAGE_SHIFT);
+
+    if ( !entry->pdev )
+        return NULL;
+
+    idx = entry->pdev->msix_table_idx[nr_page];
+    if ( !idx )
+        return NULL;
+
+    return (void *)(fix_to_virt(idx) +
+                    (addr & ((1UL << PAGE_SHIFT) - 1)));
+}
+
+static int msixtbl_read(
+    struct vcpu *v, unsigned long address,
+    unsigned long len, unsigned long *pval)
+{
+    unsigned long offset;
+    struct msixtbl_entry *entry;
+    void *virt;
+    int r = 0;
+
+    rcu_read_lock();
+
+    if ( len != 4 )
+        goto out;
+
+    offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
+    if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET)
+        goto out;
+
+    entry = msixtbl_find_entry(v, address);
+    virt = msixtbl_addr_to_virt(entry, address);
+    if ( !virt )
+        goto out;
+
+    *pval = readl(virt);
+    r = 1;
+
+out:
+    rcu_read_unlock();
+    return r;
+}
+
+static int msixtbl_write(struct vcpu *v, unsigned long address,
+                        unsigned long len, unsigned long val)
+{
+    unsigned long offset;
+    struct msixtbl_entry *entry;
+    void *virt;
+    int nr_entry;
+    int r = 0;
+
+    rcu_read_lock();
+
+    if ( len != 4 )
+        goto out;
+
+    entry = msixtbl_find_entry(v, address);
+    nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
+
+    offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
+    if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET)
+    {
+        set_bit(nr_entry, &entry->table_flags);
+        goto out;
+    }
+
+    /* exit to device model if address/data has been modified */
+    if ( test_and_clear_bit(nr_entry, &entry->table_flags) )
+        goto out;
+
+    virt = msixtbl_addr_to_virt(entry, address);
+    if ( !virt )
+        goto out;
+
+    writel(val, virt);
+    r = 1;
+
+out:
+    rcu_read_unlock();
+    return r;
+}
+
+static int msixtbl_range(struct vcpu *v, unsigned long addr)
+{
+    struct msixtbl_entry *entry;
+    void *virt;
+
+    rcu_read_lock();
+
+    entry = msixtbl_find_entry(v, addr);
+    virt = msixtbl_addr_to_virt(entry, addr);
+
+    rcu_read_unlock();
+
+    return !!virt;
+}
+
+struct hvm_mmio_handler msixtbl_mmio_handler = {
+    .check_handler = msixtbl_range,
+    .read_handler = msixtbl_read,
+    .write_handler = msixtbl_write
+};
+
+static void add_msixtbl_entry(struct domain *d,
+                              struct pci_dev *pdev,
+                              uint64_t gtable,
+                              struct msixtbl_entry *entry)
+{
+    u32 len;
+
+    memset(entry, 0, sizeof(struct msixtbl_entry));
+
+    INIT_LIST_HEAD(&entry->list);
+    INIT_RCU_HEAD(&entry->rcu);
+    atomic_set(&entry->refcnt, 0);
+
+    len = pci_msix_get_table_len(pdev);
+    entry->table_len = len;
+    entry->pdev = pdev;
+    entry->gtable = (unsigned long) gtable;
+
+    list_add_rcu(&entry->list, &d->arch.hvm_domain.msixtbl_list);
+}
+
+static void free_msixtbl_entry(struct rcu_head *rcu)
+{
+    struct msixtbl_entry *entry;
+
+    entry = container_of (rcu, struct msixtbl_entry, rcu);
+
+    xfree(entry);
+}
+
+static void del_msixtbl_entry(struct msixtbl_entry *entry)
+{
+    list_del_rcu(&entry->list);
+    call_rcu(&entry->rcu, free_msixtbl_entry);
+}
+
+int msixtbl_pt_register(struct domain *d, int pirq)
+{
+    irq_desc_t *irq_desc;
+    struct msi_desc *msi_desc;
+    struct pci_dev *pdev;
+    struct msixtbl_entry *entry, *new_entry;
+    int r = -EINVAL;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
+    /*
+     * xmalloc() with irq_disabled causes the failure of check_lock() 
+     * for xenpool->lock. So we allocate an entry beforehand.
+     */
+    new_entry = xmalloc(struct msixtbl_entry);
+    if ( !new_entry )
+        return -ENOMEM;
+
+    irq_desc = domain_spin_lock_irq_desc(d, pirq, NULL);
+    if ( !irq_desc )
+    {
+        xfree(new_entry);
+        return r;
+    }
+
+    if ( irq_desc->handler != &pci_msi_type )
+        goto out;
+
+    msi_desc = irq_desc->msi_desc;
+    if ( !msi_desc )
+        goto out;
+
+    pdev = msi_desc->dev;
+    if ( !pdev->msix_table )
+    {
+        r = 0;    /* msix_table is not mandatory */
+        goto out;
+    }
+
+    spin_lock(&d->arch.hvm_domain.msixtbl_list_lock);
+
+    list_for_each_entry( entry, &d->arch.hvm_domain.msixtbl_list, list )
+        if ( pdev == entry->pdev )
+            goto found;
+
+    entry = new_entry;
+    new_entry = NULL;
+    add_msixtbl_entry(d, pdev, pdev->msix_table, entry);
+
+found:
+    atomic_inc(&entry->refcnt);
+    spin_unlock(&d->arch.hvm_domain.msixtbl_list_lock);
+    r = 0;
+
+out:
+    spin_unlock_irq(&irq_desc->lock);
+    xfree(new_entry);
+    return r;
+}
+
+void msixtbl_pt_unregister(struct domain *d, int pirq)
+{
+    irq_desc_t *irq_desc;
+    struct msi_desc *msi_desc;
+    struct pci_dev *pdev;
+    struct msixtbl_entry *entry;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
+    irq_desc = domain_spin_lock_irq_desc(d, pirq, NULL);
+    if ( !irq_desc )
+        return;
+
+    if ( irq_desc->handler != &pci_msi_type )
+        goto out;
+
+    msi_desc = irq_desc->msi_desc;
+    if ( !msi_desc )
+        goto out;
+
+    pdev = msi_desc->dev;
+
+    spin_lock(&d->arch.hvm_domain.msixtbl_list_lock);
+
+    list_for_each_entry( entry, &d->arch.hvm_domain.msixtbl_list, list )
+        if ( pdev == entry->pdev )
+            goto found;
+
+    spin_unlock(&d->arch.hvm_domain.msixtbl_list_lock);
+
+
+out:
+    spin_unlock_irq(&irq_desc->lock);
+    return;
+
+found:
+    if ( atomic_dec_and_test(&entry->refcnt) )
+        del_msixtbl_entry(entry);
+
+    spin_unlock(&d->arch.hvm_domain.msixtbl_list_lock);
+    spin_unlock_irq(&irq_desc->lock);
+}
+void msixtbl_pt_cleanup(struct domain *d, int pirq)
+{
+    struct msixtbl_entry *entry, *temp;
+    unsigned long flags;
+
+    /* msixtbl_list_lock must be acquired with irq_disabled for check_lock() */
+    local_irq_save(flags);
+    spin_lock(&d->arch.hvm_domain.msixtbl_list_lock);
+
+    list_for_each_entry_safe( entry, temp,
+                              &d->arch.hvm_domain.msixtbl_list, list )
+        del_msixtbl_entry(entry);
+
+    spin_unlock(&d->arch.hvm_domain.msixtbl_list_lock);
+    local_irq_restore(flags);
+}
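
The msixtbl_read()/msixtbl_write() handlers above only complete aligned 32-bit accesses to the vector-control dword of an MSI-X table entry; other accesses fall through to the device model, and writes to the address/data fields are flagged in table_flags so the next vector-control write is handed to the device model as well. A minimal standalone sketch of that offset test, assuming the usual PCI values of 16 bytes per entry and offset 12 for the vector-control dword:

#include <stdint.h>
#include <stdio.h>

/* Each MSI-X table entry is 16 bytes and the vector-control dword (which
 * carries the mask bit) sits at offset 12 -- the values assumed here for
 * PCI_MSIX_ENTRY_SIZE and PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET. */
#define MSIX_ENTRY_SIZE          16u
#define MSIX_VECTOR_CTRL_OFFSET  12u

/* Returns 1 only for an aligned 4-byte access to the vector-control dword,
 * the single case the hypervisor-side interception handles directly. */
static int is_vector_ctrl_access(uint64_t addr, unsigned int len)
{
    return (len == 4) &&
           ((addr & (MSIX_ENTRY_SIZE - 1)) == MSIX_VECTOR_CTRL_OFFSET);
}

int main(void)
{
    printf("%d\n", is_vector_ctrl_access(0xfee0100cULL, 4)); /* 1: mask-bit dword */
    printf("%d\n", is_vector_ctrl_access(0xfee01000ULL, 4)); /* 0: address/data dword */
    printf("%d\n", is_vector_ctrl_access(0xfee0100cULL, 8)); /* 0: wrong width */
    return 0;
}
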
diff -Naurp xen/arch/x86/hvm/vmx/intr.c xen-redhat/arch/x86/hvm/vmx/intr.c
--- xen/arch/x86/hvm/vmx/intr.c
+++ xen-redhat/arch/x86/hvm/vmx/intr.c
@@ -71,13 +71,38 @@
  * the effect is cleared. (i.e., MOV-SS-blocking 'dominates' STI-blocking).
  */
 
-static void enable_irq_window(struct vcpu *v)
+static void enable_intr_window(struct vcpu *v, enum hvm_intack intr_source)
 {
-    u32  *cpu_exec_control = &v->arch.hvm_vmx.exec_control;
-    
-    if ( !(*cpu_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) )
+    u32 *cpu_exec_control = &v->arch.hvm_vcpu.u.vmx.exec_control;
+    u32 ctl = CPU_BASED_VIRTUAL_INTR_PENDING;
+
+    ASSERT(intr_source != hvm_intack_none);
+
+    if ( (intr_source == hvm_intack_nmi) && cpu_has_vmx_vnmi )
+    {
+        /*
+         * We set MOV-SS blocking in lieu of STI blocking when delivering an
+         * NMI. This is because it is processor-specific whether STI-blocking
+         * blocks NMIs. Hence we *must* check for STI-blocking on NMI delivery
+         * (otherwise vmentry will fail on processors that check for STI-
+         * blocking) but if the processor does not check for STI-blocking then
+         * we may immediately vmexit and hence make no progress!
+         * (see SDM 3B 21.3, "Other Causes of VM Exits").
+         */
+        u32 intr_shadow = __vmread(GUEST_INTERRUPTIBILITY_INFO);
+        if ( intr_shadow & VMX_INTR_SHADOW_STI )
+        {
+            /* Having both STI-blocking and MOV-SS-blocking fails vmentry. */
+            intr_shadow &= ~VMX_INTR_SHADOW_STI;
+            intr_shadow |= VMX_INTR_SHADOW_MOV_SS;
+            __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow);
+        }
+        ctl = CPU_BASED_VIRTUAL_NMI_PENDING;
+    }
+
+    if ( !(*cpu_exec_control & ctl) )
     {
-        *cpu_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
+        *cpu_exec_control |= ctl;
         __vmwrite(CPU_BASED_VM_EXEC_CONTROL, *cpu_exec_control);
     }
 }
@@ -107,77 +132,51 @@ static void update_tpr_threshold(struct 
 
 asmlinkage void vmx_intr_assist(void)
 {
-    int has_ext_irq, intr_vector, intr_type = 0;
-    unsigned long eflags, intr_shadow;
+    int intr_vector;
+    enum hvm_intack intr_source;
     struct vcpu *v = current;
-    unsigned int idtv_info_field;
-    unsigned long inst_len;
+    unsigned int intr_info;
 
+    /* Crank the handle on interrupt state. */
     pt_update_irq(v);
+    hvm_dirq_assist(v);
 
-    hvm_set_callback_irq_level();
-
-    update_tpr_threshold(vcpu_vlapic(v));
-
-    has_ext_irq = cpu_has_pending_irq(v);
-
-    if ( unlikely(v->arch.hvm_vmx.vector_injected) )
-    {
-        v->arch.hvm_vmx.vector_injected = 0;
-        if ( unlikely(has_ext_irq) )
-            enable_irq_window(v);
-        return;
-    }
-
-    /* This could be moved earlier in the VMX resume sequence. */
-    idtv_info_field = __vmread(IDT_VECTORING_INFO_FIELD);
-    if ( unlikely(idtv_info_field & INTR_INFO_VALID_MASK) )
-    {
-        __vmwrite(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
+    do {
+        intr_source = hvm_vcpu_has_pending_irq(v);
+        if ( likely(intr_source == hvm_intack_none) )
+            goto out;
 
         /*
-         * Safe: the length will only be interpreted for software exceptions
-         * and interrupts. If we get here then delivery of some event caused a
-         * fault, and this always results in defined VM_EXIT_INSTRUCTION_LEN.
+         * An event is already pending or the pending interrupt is masked?
+         * Then the pending interrupt must be delayed.
          */
-        inst_len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe */
-        __vmwrite(VM_ENTRY_INSTRUCTION_LEN, inst_len);
+        intr_info = __vmread(VM_ENTRY_INTR_INFO_FIELD);
+        if ( unlikely(intr_info & INTR_INFO_VALID_MASK) ||
+             !hvm_interrupts_enabled(v, intr_source) )
+        {
+            enable_intr_window(v, intr_source);
+            goto out;
+        }
+    } while ( !hvm_vcpu_ack_pending_irq(v, intr_source, &intr_vector) );
 
-        if ( unlikely(idtv_info_field & 0x800) ) /* valid error code */
-            __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE,
-                      __vmread(IDT_VECTORING_ERROR_CODE));
-        if ( unlikely(has_ext_irq) )
-            enable_irq_window(v);
-
-        HVM_DBG_LOG(DBG_LEVEL_1, "idtv_info_field=%x", idtv_info_field);
-        return;
-    }
-
-    if ( likely(!has_ext_irq) )
-        return;
-
-    intr_shadow = __vmread(GUEST_INTERRUPTIBILITY_INFO);
-    if ( unlikely(intr_shadow & (VMX_INTR_SHADOW_STI|VMX_INTR_SHADOW_MOV_SS)) )
+    if ( intr_source == hvm_intack_nmi )
     {
-        enable_irq_window(v);
-        HVM_DBG_LOG(DBG_LEVEL_1, "interruptibility");
-        return;
+        vmx_inject_nmi(v);
     }
-
-    eflags = __vmread(GUEST_RFLAGS);
-    if ( irq_masked(eflags) )
+    else
     {
-        enable_irq_window(v);
-        return;
+        HVMTRACE_2D(INJ_VIRQ, v, intr_vector, /*fake=*/ 0);
+        vmx_inject_extint(v, intr_vector);
+        pt_intr_post(v, intr_vector, intr_source);
     }
 
-    intr_vector = cpu_get_interrupt(v, &intr_type);
-    BUG_ON(intr_vector < 0);
-
-    HVMTRACE_2D(INJ_VIRQ, v, intr_vector, /*fake=*/ 0);
-    vmx_inject_extint(v, intr_vector, VMX_DELIVER_NO_ERROR_CODE);
+    /* Is there another IRQ to queue up behind this one? */
+    intr_source = hvm_vcpu_has_pending_irq(v);
+    if ( unlikely(intr_source != hvm_intack_none) )
+        enable_intr_window(v, intr_source);
 
-    pt_intr_post(v, intr_vector, intr_type);
+ out:
+    update_tpr_threshold(vcpu_vlapic(v));
 }
 
 /*
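
enable_intr_window() above trades STI blocking for MOV-SS blocking before arming a virtual-NMI window, for the reason spelled out in its comment. A standalone model of just that interruptibility-state fix-up; the SHADOW_* values are the architectural bit positions from the SDM, and all VMCS access is stubbed out:

#include <stdint.h>
#include <stdio.h>

/* Guest interruptibility-state bits of the VMCS
 * GUEST_INTERRUPTIBILITY_INFO field. */
#define SHADOW_STI     (1u << 0)   /* blocking by STI */
#define SHADOW_MOV_SS  (1u << 1)   /* blocking by MOV SS */

/* Mirrors the fix-up applied before requesting an NMI window: STI blocking
 * is rewritten as MOV-SS blocking, because whether STI blocking also blocks
 * NMIs is processor-specific, and having both bits set fails vmentry. */
static uint32_t shadow_for_nmi_window(uint32_t intr_shadow)
{
    if ( intr_shadow & SHADOW_STI )
    {
        intr_shadow &= ~SHADOW_STI;
        intr_shadow |= SHADOW_MOV_SS;
    }
    return intr_shadow;
}

int main(void)
{
    printf("%#x\n", shadow_for_nmi_window(SHADOW_STI));                 /* 0x2 */
    printf("%#x\n", shadow_for_nmi_window(SHADOW_STI | SHADOW_MOV_SS)); /* 0x2 */
    printf("%#x\n", shadow_for_nmi_window(0));                          /* 0 */
    return 0;
}
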
diff -Naurp xen/arch/x86/hvm/vmx/vmcs.c xen-redhat/arch/x86/hvm/vmx/vmcs.c
--- xen/arch/x86/hvm/vmx/vmcs.c
+++ xen-redhat/arch/x86/hvm/vmx/vmcs.c
@@ -37,6 +37,9 @@
 #include <xen/keyhandler.h>
 #include <asm/shadow.h>
 
+static int opt_vpid_enabled = 1;
+boolean_param("vpid", opt_vpid_enabled);
+
 /* Dynamic (run-time adjusted) execution control flags. */
 u32 vmx_pin_based_exec_control __read_mostly;
 u32 vmx_cpu_based_exec_control __read_mostly;
@@ -64,7 +67,7 @@ static u32 adjust_vmx_controls(u32 ctl_m
 }
 
 #define vmx_has_secondary_exec_ctls \
-    (_vmx_cpu_based_exec_control & ACTIVATE_SECONDARY_CONTROLS)
+    (_vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
 
 void vmx_init_vmcs_config(void)
 {
@@ -75,21 +78,25 @@ void vmx_init_vmcs_config(void)
     u32 _vmx_vmexit_control;
     u32 _vmx_vmentry_control;
 
+    rdmsr(MSR_IA32_VMX_BASIC_MSR, vmx_msr_low, vmx_msr_high);
+
     min = (PIN_BASED_EXT_INTR_MASK |
            PIN_BASED_NMI_EXITING);
-    opt = 0; /*PIN_BASED_VIRTUAL_NMIS*/
+    opt = PIN_BASED_VIRTUAL_NMIS;
     _vmx_pin_based_exec_control = adjust_vmx_controls(
         min, opt, MSR_IA32_VMX_PINBASED_CTLS_MSR);
 
     min = (CPU_BASED_HLT_EXITING |
            CPU_BASED_INVLPG_EXITING |
+           CPU_BASED_CR3_LOAD_EXITING |
+           CPU_BASED_CR3_STORE_EXITING |
            CPU_BASED_MWAIT_EXITING |
            CPU_BASED_MOV_DR_EXITING |
            CPU_BASED_ACTIVATE_IO_BITMAP |
            CPU_BASED_USE_TSC_OFFSETING);
-    opt = CPU_BASED_ACTIVATE_MSR_BITMAP;
-    opt |= CPU_BASED_TPR_SHADOW;
-    opt |= ACTIVATE_SECONDARY_CONTROLS;
+    opt = (CPU_BASED_ACTIVATE_MSR_BITMAP |
+           CPU_BASED_TPR_SHADOW |
+           CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
     _vmx_cpu_based_exec_control = adjust_vmx_controls(
         min, opt, MSR_IA32_VMX_PROCBASED_CTLS_MSR);
 #ifdef __x86_64__
@@ -107,24 +114,44 @@ void vmx_init_vmcs_config(void)
     if ( vmx_has_secondary_exec_ctls )
     {
         min = 0;
-        opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+
+        opt = (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+          SECONDARY_EXEC_ENABLE_EPT);
+        if ( opt_vpid_enabled )
+            opt |= SECONDARY_EXEC_ENABLE_VPID;
         _vmx_secondary_exec_control = adjust_vmx_controls(
             min, opt, MSR_IA32_VMX_PROCBASED_CTLS2);
     }
 
+    if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT )
+    {
+        /*
+         * To use EPT we expect to be able to clear certain intercepts.
+         * We check VMX_BASIC_MSR[55] to correctly handle default1 controls.
+         */
+        uint32_t must_be_one, must_be_zero, msr = MSR_IA32_VMX_PROCBASED_CTLS_MSR;
+        if ( vmx_msr_high & (1u << 23) )
+            msr = MSR_IA32_VMX_TRUE_PROCBASED_CTLS;
+        rdmsr(msr, must_be_one, must_be_zero);
+        if ( must_be_one & (CPU_BASED_INVLPG_EXITING |
+                            CPU_BASED_CR3_LOAD_EXITING |
+                            CPU_BASED_CR3_STORE_EXITING) )
+            _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+    }
+
     min = VM_EXIT_ACK_INTR_ON_EXIT;
-    opt = 0;
+    opt = VM_EXIT_SAVE_GUEST_PAT | VM_EXIT_LOAD_HOST_PAT;
 #ifdef __x86_64__
     min |= VM_EXIT_IA32E_MODE;
 #endif
     _vmx_vmexit_control = adjust_vmx_controls(
         min, opt, MSR_IA32_VMX_EXIT_CTLS_MSR);
 
-    min = opt = 0;
+    min = 0;
+    opt = VM_ENTRY_LOAD_GUEST_PAT;
     _vmx_vmentry_control = adjust_vmx_controls(
         min, opt, MSR_IA32_VMX_ENTRY_CTLS_MSR);
 
-    rdmsr(MSR_IA32_VMX_BASIC_MSR, vmx_msr_low, vmx_msr_high);
 
     if ( smp_processor_id() == 0 )
     {
@@ -205,34 +232,69 @@ static void vmx_load_vmcs(struct vcpu *v
     this_cpu(current_vmcs) = v->arch.hvm_vmx.vmcs;
 }
 
+struct foreign_vmcs {
+    struct vcpu *v;
+    unsigned int count;
+};
+static DEFINE_PER_CPU(struct foreign_vmcs, foreign_vmcs);
+
 void vmx_vmcs_enter(struct vcpu *v)
 {
+    struct foreign_vmcs *fv;
+
     /*
      * NB. We must *always* run an HVM VCPU on its own VMCS, except for
      * vmx_vmcs_enter/exit critical regions.
      */
-    if ( v == current )
+    if ( likely(v == current) )
         return;
 
-    vcpu_pause(v);
-    spin_lock(&v->arch.hvm_vmx.vmcs_lock);
+    fv = &this_cpu(foreign_vmcs);
 
-    vmx_clear_vmcs(v);
-    vmx_load_vmcs(v);
+    if ( fv->v == v )
+    {
+        BUG_ON(fv->count == 0);
+    }
+    else
+    {
+        BUG_ON(fv->v != NULL);
+        BUG_ON(fv->count != 0);
+
+        vcpu_pause(v);
+        spin_lock(&v->arch.hvm_vmx.vmcs_lock);
+
+        vmx_clear_vmcs(v);
+        vmx_load_vmcs(v);
+
+        fv->v = v;
+    }
+
+    fv->count++;
 }
 
 void vmx_vmcs_exit(struct vcpu *v)
 {
-    if ( v == current )
+    struct foreign_vmcs *fv;
+
+    if ( likely(v == current) )
         return;
 
-    /* Don't confuse vmx_do_resume (for @v or @current!) */
-    vmx_clear_vmcs(v);
-    if ( is_hvm_vcpu(current) )
-        vmx_load_vmcs(current);
+    fv = &this_cpu(foreign_vmcs);
+    BUG_ON(fv->v != v);
+    BUG_ON(fv->count == 0);
 
-    spin_unlock(&v->arch.hvm_vmx.vmcs_lock);
-    vcpu_unpause(v);
+    if ( --fv->count == 0 )
+    {
+        /* Don't confuse vmx_do_resume (for @v or @current!) */
+        vmx_clear_vmcs(v);
+        if ( is_hvm_vcpu(current) )
+            vmx_load_vmcs(current);
+
+        spin_unlock(&v->arch.hvm_vmx.vmcs_lock);
+        vcpu_unpause(v);
+
+        fv->v = NULL;
+    }
 }
 
 struct vmcs_struct *vmx_alloc_host_vmcs(void)
@@ -273,27 +335,14 @@ struct host_execution_env {
 
 static void vmx_set_host_env(struct vcpu *v)
 {
-    unsigned int tr, cpu;
-    struct host_execution_env host_env;
-    struct Xgt_desc_struct desc;
-
-    cpu = smp_processor_id();
-    __asm__ __volatile__ ("sidt  (%0) \n" :: "a"(&desc) : "memory");
-    host_env.idtr_limit = desc.size;
-    host_env.idtr_base = desc.address;
-    __vmwrite(HOST_IDTR_BASE, host_env.idtr_base);
-
-    __asm__ __volatile__ ("sgdt  (%0) \n" :: "a"(&desc) : "memory");
-    host_env.gdtr_limit = desc.size;
-    host_env.gdtr_base = desc.address;
-    __vmwrite(HOST_GDTR_BASE, host_env.gdtr_base);
-
-    __asm__ __volatile__ ("str  (%0) \n" :: "a"(&tr) : "memory");
-    host_env.tr_selector = tr;
-    host_env.tr_limit = sizeof(struct tss_struct);
-    host_env.tr_base = (unsigned long) &init_tss[cpu];
-    __vmwrite(HOST_TR_SELECTOR, host_env.tr_selector);
-    __vmwrite(HOST_TR_BASE, host_env.tr_base);
+    unsigned int cpu = smp_processor_id();
+
+    __vmwrite(HOST_GDTR_BASE,
+              (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY));
+    __vmwrite(HOST_IDTR_BASE, (unsigned long)idt_tables[cpu]);
+
+    __vmwrite(HOST_TR_SELECTOR, TSS_ENTRY << 3);
+    __vmwrite(HOST_TR_BASE, (unsigned long)&init_tss[cpu]);
 
     /*
      * Skip end of cpu_user_regs when entering the hypervisor because the
@@ -306,6 +355,7 @@ static void vmx_set_host_env(struct vcpu
 
 static void construct_vmcs(struct vcpu *v)
 {
+    struct domain *d = v->domain;
     unsigned long cr0, cr4;
     union vmcs_arbytes arbytes;
 
@@ -313,12 +363,31 @@ static void construct_vmcs(struct vcpu *
 
     /* VMCS controls. */
     __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
+
+    v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
+    v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control;
+
+    if ( paging_mode_hap(d) )
+    {
+        v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING |
+                                          CPU_BASED_CR3_LOAD_EXITING |
+                                          CPU_BASED_CR3_STORE_EXITING);
+    }
+    else
+    {
+        v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+        vmx_vmexit_control &= ~(VM_EXIT_SAVE_GUEST_PAT |
+                                VM_EXIT_LOAD_HOST_PAT);
+        vmx_vmentry_control &= ~VM_ENTRY_LOAD_GUEST_PAT;
+    }
+
+    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
     __vmwrite(VM_EXIT_CONTROLS, vmx_vmexit_control);
     __vmwrite(VM_ENTRY_CONTROLS, vmx_vmentry_control);
-    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmx_cpu_based_exec_control);
-    v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
-    if ( vmx_cpu_based_exec_control & ACTIVATE_SECONDARY_CONTROLS )
-        __vmwrite(SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control);
+
+    if ( cpu_has_vmx_secondary_exec_control )
+        __vmwrite(SECONDARY_VM_EXEC_CONTROL,
+          v->arch.hvm_vmx.secondary_exec_control);
 
     if ( cpu_has_vmx_msr_bitmap )
         __vmwrite(MSR_BITMAP, virt_to_maddr(vmx_msr_bitmap));
@@ -346,7 +415,7 @@ static void construct_vmcs(struct vcpu *
 
     /* Host control registers. */
     __vmwrite(HOST_CR0, read_cr0() | X86_CR0_TS);
-    __vmwrite(HOST_CR4, read_cr4());
+    __vmwrite(HOST_CR4, mmu_cr4_features);
 
     /* Host CS:RIP. */
     __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS);
@@ -428,7 +497,10 @@ static void construct_vmcs(struct vcpu *
     __vmwrite(VMCS_LINK_POINTER_HIGH, ~0UL);
 #endif
 
-    __vmwrite(EXCEPTION_BITMAP, HVM_TRAP_MASK | (1U << TRAP_page_fault));
+    if ( paging_mode_hap(d) )
+        __vmwrite(EXCEPTION_BITMAP, HVM_TRAP_MASK);
+    else
+        __vmwrite(EXCEPTION_BITMAP, HVM_TRAP_MASK | (1U << TRAP_page_fault));
 
     /* Guest CR0. */
     cr0 = read_cr0();
@@ -439,7 +511,14 @@ static void construct_vmcs(struct vcpu *
 
     /* Guest CR4. */
     cr4 = read_cr4();
-    __vmwrite(GUEST_CR4, cr4 & ~X86_CR4_PSE);
+    if ( paging_mode_hap(v->domain) )
+    {
+        hvm_update_guest_cr(v, 0);
+        hvm_update_guest_cr(v, 4);
+    }
+    else
+        __vmwrite(GUEST_CR4, cr4 & ~X86_CR4_PSE);
+
     v->arch.hvm_vmx.cpu_shadow_cr4 =
         cr4 & ~(X86_CR4_PGE | X86_CR4_VMXE | X86_CR4_PAE);
     __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
@@ -454,11 +533,34 @@ static void construct_vmcs(struct vcpu *
     }
 #endif
 
+    if ( paging_mode_hap(d) )
+    {
+        v->arch.hvm_vmx.ept_control.etmt = EPT_DEFAULT_MT;
+        v->arch.hvm_vmx.ept_control.gaw  = EPT_DEFAULT_GAW;
+        v->arch.hvm_vmx.ept_control.asr  =
+          pagetable_get_pfn(d->arch.phys_table);
+        __vmwrite(EPT_POINTER, v->arch.hvm_vmx.ept_control.eptp);
+#ifdef CONFIG_X86_PAE
+        __vmwrite(EPT_POINTER_HIGH, v->arch.hvm_vmx.ept_control.eptp >> 32);
+#endif
+    }
+
+    if ( cpu_has_vmx_vpid )
+    {
+        v->arch.hvm_vmx.vpid = v->vcpu_id +
+          v->domain->arch.hvm_domain.vmx_vpid_base;
+        __vmwrite(VIRTUAL_PROCESSOR_ID, v->arch.hvm_vmx.vpid);
+    }
+
     /* Memory-mapped based VLAPIC TPR optimization. */
     if ( cpu_has_vmx_mmap_vtpr_optimization )
     {
         __vmwrite(VIRTUAL_APIC_PAGE_ADDR,
                     page_to_maddr(vcpu_vlapic(v)->regs_page));
+#if defined (CONFIG_X86_PAE)
+        __vmwrite(VIRTUAL_APIC_PAGE_ADDR_HIGH,
+                    page_to_maddr(vcpu_vlapic(v)->regs_page) >> 32);
+#endif
         __vmwrite(TPR_THRESHOLD, 0);
 
         vcpu_vlapic(v)->mmap_vtpr_enabled = 1;
@@ -471,6 +573,21 @@ static void construct_vmcs(struct vcpu *
     __vmwrite(GUEST_TR_BASE, 0);
     __vmwrite(GUEST_TR_LIMIT, 0xff);
 
+    if ( cpu_has_vmx_pat && paging_mode_hap(d) )
+    {
+        u64 host_pat, guest_pat;
+
+        rdmsrl(MSR_IA32_CR_PAT, host_pat);
+        guest_pat = 0x7040600070406ULL;
+
+        __vmwrite(HOST_PAT, host_pat);
+        __vmwrite(GUEST_PAT, guest_pat);
+#ifdef __i386__
+        __vmwrite(HOST_PAT_HIGH, host_pat >> 32);
+        __vmwrite(GUEST_PAT_HIGH, guest_pat >> 32);
+#endif
+    }
+
     vmx_vmcs_exit(v);
 
     paging_update_paging_modes(v); /* will update HOST & GUEST_CR3 as reqd */
@@ -533,6 +650,7 @@ void vmx_do_resume(struct vcpu *v)
         vmx_load_vmcs(v);
         hvm_migrate_timers(v);
         vmx_set_host_env(v);
+        vpid_sync_vcpu_all(v);
     }
 
     if ( !v->arch.hvm_vmx.launched && vcpu_vlapic(v)->mmap_vtpr_enabled )
@@ -545,6 +663,9 @@ void vmx_do_resume(struct vcpu *v)
             domain_crash_synchronous();
         }
         __vmwrite(APIC_ACCESS_ADDR, page_to_maddr(pg));
+#if defined (CONFIG_X86_PAE)
+        __vmwrite(APIC_ACCESS_ADDR_HIGH, page_to_maddr(pg) >> 32);
+#endif
     }
 
     debug_state = v->domain->debugger_attached;
@@ -564,55 +685,142 @@ void vmx_do_resume(struct vcpu *v)
     reset_stack_and_jump(vmx_asm_do_vmentry);
 }
 
-/* Dump a section of VMCS */
-static void print_section(char *header, uint32_t start, 
-                          uint32_t end, int incr)
+static void vmx_dump_sel(char *name, enum x86_segment seg)
 {
-    uint32_t addr, j;
-    unsigned long val;
-    int code, rc;
-    char *fmt[4] = {"0x%04lx ", "0x%016lx ", "0x%08lx ", "0x%016lx "};
-    char *err[4] = {"------ ", "------------------ ", 
-                    "---------- ", "------------------ "};
-
-    /* Find width of the field (encoded in bits 14:13 of address) */
-    code = (start>>13)&3;
-
-    if (header)
-        printk("\t %s", header);
-
-    for (addr=start, j=0; addr<=end; addr+=incr, j++) {
-
-        if (!(j&3))
-            printk("\n\t\t0x%08x: ", addr);
-
-        val = __vmread_safe(addr, &rc);
-        if (rc == 0)
-            printk(fmt[code], val);
-        else
-            printk("%s", err[code]);
-    }
+    struct segment_register sreg;
+    hvm_get_segment_register(current, seg, &sreg);
+    printk("%s: sel=0x%04x, attr=0x%04x, limit=0x%08x, base=0x%016llx\n", 
+           name, sreg.sel, sreg.attr.bytes, sreg.limit,
+           (unsigned long long)sreg.base);
+}
 
-    printk("\n");
+static unsigned long vmr(unsigned long field)
+{
+    int rc;
+    unsigned long val;
+    val = __vmread_safe(field, &rc);
+    return rc ? 0 : val;
 }
 
-/* Dump current VMCS */
 void vmcs_dump_vcpu(void)
 {
-    print_section("16-bit Guest-State Fields", 0x800, 0x80e, 2);
-    print_section("16-bit Host-State Fields", 0xc00, 0xc0c, 2);
-    print_section("64-bit Control Fields", 0x2000, 0x2013, 1);
-    print_section("64-bit Guest-State Fields", 0x2800, 0x2803, 1);
-    print_section("32-bit Control Fields", 0x4000, 0x401c, 2);
-    print_section("32-bit RO Data Fields", 0x4400, 0x440e, 2);
-    print_section("32-bit Guest-State Fields", 0x4800, 0x482a, 2);
-    print_section("32-bit Host-State Fields", 0x4c00, 0x4c00, 2);
-    print_section("Natural 64-bit Control Fields", 0x6000, 0x600e, 2);
-    print_section("64-bit RO Data Fields", 0x6400, 0x640A, 2);
-    print_section("Natural 64-bit Guest-State Fields", 0x6800, 0x6826, 2);
-    print_section("Natural 64-bit Host-State Fields", 0x6c00, 0x6c16, 2);
-}
+    unsigned long long x;
 
+    printk("*** Guest State ***\n");
+    printk("CR0: actual=0x%016llx, shadow=0x%016llx, gh_mask=%016llx\n",
+           (unsigned long long)vmr(GUEST_CR0),
+           (unsigned long long)vmr(CR0_READ_SHADOW), 
+           (unsigned long long)vmr(CR0_GUEST_HOST_MASK));
+    printk("CR4: actual=0x%016llx, shadow=0x%016llx, gh_mask=%016llx\n",
+           (unsigned long long)vmr(GUEST_CR4),
+           (unsigned long long)vmr(CR4_READ_SHADOW), 
+           (unsigned long long)vmr(CR4_GUEST_HOST_MASK));
+    printk("CR3: actual=0x%016llx, target_count=%d\n",
+           (unsigned long long)vmr(GUEST_CR3),
+           (int)vmr(CR3_TARGET_COUNT));
+    printk("     target0=%016llx, target1=%016llx\n",
+           (unsigned long long)vmr(CR3_TARGET_VALUE0),
+           (unsigned long long)vmr(CR3_TARGET_VALUE1));
+    printk("     target2=%016llx, target3=%016llx\n",
+           (unsigned long long)vmr(CR3_TARGET_VALUE2),
+           (unsigned long long)vmr(CR3_TARGET_VALUE3));
+    printk("RSP = 0x%016llx  RIP = 0x%016llx\n", 
+           (unsigned long long)vmr(GUEST_RSP),
+           (unsigned long long)vmr(GUEST_RIP));
+    printk("RFLAGS=0x%016llx  DR7 = 0x%016llx\n", 
+           (unsigned long long)vmr(GUEST_RFLAGS),
+           (unsigned long long)vmr(GUEST_DR7));
+    printk("Sysenter RSP=%016llx CS:RIP=%04x:%016llx\n",
+           (unsigned long long)vmr(GUEST_SYSENTER_ESP),
+           (int)vmr(GUEST_SYSENTER_CS),
+           (unsigned long long)vmr(GUEST_SYSENTER_EIP));
+    vmx_dump_sel("CS", x86_seg_cs);
+    vmx_dump_sel("DS", x86_seg_ds);
+    vmx_dump_sel("SS", x86_seg_ss);
+    vmx_dump_sel("ES", x86_seg_es);
+    vmx_dump_sel("FS", x86_seg_fs);
+    vmx_dump_sel("GS", x86_seg_gs);
+    vmx_dump_sel("GDTR", x86_seg_gdtr);
+    vmx_dump_sel("LDTR", x86_seg_ldtr);
+    vmx_dump_sel("IDTR", x86_seg_idtr);
+    vmx_dump_sel("TR", x86_seg_tr);
+    x  = (unsigned long long)vmr(TSC_OFFSET_HIGH) << 32;
+    x |= (uint32_t)vmr(TSC_OFFSET);
+    printk("TSC Offset = %016llx\n", x);
+    x  = (unsigned long long)vmr(GUEST_IA32_DEBUGCTL_HIGH) << 32;
+    x |= (uint32_t)vmr(GUEST_IA32_DEBUGCTL);
+    printk("DebugCtl=%016llx DebugExceptions=%016llx\n", x,
+           (unsigned long long)vmr(GUEST_PENDING_DBG_EXCEPTIONS));
+    printk("Interruptibility=%04x ActivityState=%04x\n",
+           (int)vmr(GUEST_INTERRUPTIBILITY_INFO),
+           (int)vmr(GUEST_ACTIVITY_STATE));
+
+    printk("*** Host State ***\n");
+    printk("RSP = 0x%016llx  RIP = 0x%016llx\n", 
+           (unsigned long long)vmr(HOST_RSP),
+           (unsigned long long)vmr(HOST_RIP));
+    printk("CS=%04x DS=%04x ES=%04x FS=%04x GS=%04x SS=%04x TR=%04x\n",
+           (uint16_t)vmr(HOST_CS_SELECTOR),
+           (uint16_t)vmr(HOST_DS_SELECTOR),
+           (uint16_t)vmr(HOST_ES_SELECTOR),
+           (uint16_t)vmr(HOST_FS_SELECTOR),
+           (uint16_t)vmr(HOST_GS_SELECTOR),
+           (uint16_t)vmr(HOST_SS_SELECTOR),
+           (uint16_t)vmr(HOST_TR_SELECTOR));
+    printk("FSBase=%016llx GSBase=%016llx TRBase=%016llx\n",
+           (unsigned long long)vmr(HOST_FS_BASE),
+           (unsigned long long)vmr(HOST_GS_BASE),
+           (unsigned long long)vmr(HOST_TR_BASE));
+    printk("GDTBase=%016llx IDTBase=%016llx\n",
+           (unsigned long long)vmr(HOST_GDTR_BASE),
+           (unsigned long long)vmr(HOST_IDTR_BASE));
+    printk("CR0=%016llx CR3=%016llx CR4=%016llx\n",
+           (unsigned long long)vmr(HOST_CR0),
+           (unsigned long long)vmr(HOST_CR3),
+           (unsigned long long)vmr(HOST_CR4));
+    printk("Sysenter RSP=%016llx CS:RIP=%04x:%016llx\n",
+           (unsigned long long)vmr(HOST_IA32_SYSENTER_ESP),
+           (int)vmr(HOST_IA32_SYSENTER_CS),
+           (unsigned long long)vmr(HOST_IA32_SYSENTER_EIP));
+
+    printk("*** Control State ***\n");
+    printk("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
+           (uint32_t)vmr(PIN_BASED_VM_EXEC_CONTROL),
+           (uint32_t)vmr(CPU_BASED_VM_EXEC_CONTROL),
+           (uint32_t)vmr(SECONDARY_VM_EXEC_CONTROL));
+    printk("EntryControls=%08x ExitControls=%08x\n",
+           (uint32_t)vmr(VM_ENTRY_CONTROLS),
+           (uint32_t)vmr(VM_EXIT_CONTROLS));
+    printk("ExceptionBitmap=%08x\n",
+           (uint32_t)vmr(EXCEPTION_BITMAP));
+    printk("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
+           (uint32_t)vmr(VM_ENTRY_INTR_INFO_FIELD),
+           (uint32_t)vmr(VM_ENTRY_EXCEPTION_ERROR_CODE),
+           (uint32_t)vmr(VM_ENTRY_INSTRUCTION_LEN));
+    printk("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
+           (uint32_t)vmr(VM_EXIT_INTR_INFO),
+           (uint32_t)vmr(VM_EXIT_INTR_ERROR_CODE),
+           (uint32_t)vmr(VM_EXIT_INSTRUCTION_LEN));
+    printk("        reason=%08x qualification=%08x\n",
+           (uint32_t)vmr(VM_EXIT_REASON),
+           (uint32_t)vmr(EXIT_QUALIFICATION));
+    printk("IDTVectoring: info=%08x errcode=%08x\n",
+           (uint32_t)vmr(IDT_VECTORING_INFO_FIELD),
+           (uint32_t)vmr(IDT_VECTORING_ERROR_CODE));
+    printk("TPR Threshold = 0x%02x\n",
+           (uint32_t)vmr(TPR_THRESHOLD));
+    printk("secondary exec control = 0x%08x\n",
+      (uint32_t)vmr(SECONDARY_VM_EXEC_CONTROL));
+    printk("Guest PAT = 0x%08x%08x\n",
+           (uint32_t)vmr(GUEST_PAT_HIGH), (uint32_t)vmr(GUEST_PAT));
+    printk("Host PAT = 0x%08x%08x\n",
+           (uint32_t)vmr(HOST_PAT_HIGH), (uint32_t)vmr(HOST_PAT));
+    printk("EPT pointer = 0x%08x%08x\n",
+      (uint32_t)vmr(EPT_POINTER_HIGH), (uint32_t)vmr(EPT_POINTER));
+    printk("virtual processor ID = 0x%04x\n",
+      (uint32_t)vmr(VIRTUAL_PROCESSOR_ID));
+
+}
 
 static void vmcs_dump(unsigned char ch)
 {
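
The vmcs.c changes above also make vmx_vmcs_enter()/vmx_vmcs_exit() nestable by tracking a per-CPU foreign_vmcs owner and count, so only the outermost pair pays for pausing and reloading a remote vCPU's VMCS. A standalone sketch of that counting scheme with the scheduler and VMCS operations reduced to stubs:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* "struct vcpu" is just an opaque token here; pause_and_load() and
 * clear_and_unpause() stand in for the vcpu_pause/vmcs_lock/load and
 * clear/unlock/unpause sequences in the real code. */
struct vcpu { int id; };

struct foreign_vmcs {
    struct vcpu *v;
    unsigned int count;
};

static struct foreign_vmcs fv;  /* one instance per physical CPU in Xen */

static void pause_and_load(struct vcpu *v)    { printf("load vcpu %d\n", v->id); }
static void clear_and_unpause(struct vcpu *v) { printf("clear vcpu %d\n", v->id); }

static void vmcs_enter(struct vcpu *v)
{
    if ( fv.v == v )
        assert(fv.count != 0);          /* nested re-entry: nothing to do */
    else
    {
        assert(fv.v == NULL && fv.count == 0);
        pause_and_load(v);
        fv.v = v;
    }
    fv.count++;
}

static void vmcs_exit(struct vcpu *v)
{
    assert(fv.v == v && fv.count != 0);
    if ( --fv.count == 0 )
    {
        clear_and_unpause(v);
        fv.v = NULL;
    }
}

int main(void)
{
    struct vcpu remote = { .id = 3 };
    vmcs_enter(&remote);
    vmcs_enter(&remote);   /* nested: no second load */
    vmcs_exit(&remote);
    vmcs_exit(&remote);    /* outermost exit: clear happens here */
    return 0;
}
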
diff -Naurp xen/arch/x86/hvm/vmx/vmx.c xen-redhat/arch/x86/hvm/vmx/vmx.c
--- xen/arch/x86/hvm/vmx/vmx.c
+++ xen-redhat/arch/x86/hvm/vmx/vmx.c
@@ -50,11 +50,25 @@
 #include <asm/hvm/vpt.h>
 #include <public/hvm/save.h>
 #include <asm/hvm/trace.h>
+#include <asm/paging.h>
+#include <asm/debugger.h>
 
 char *vmx_msr_bitmap;
 
 static void vmx_ctxt_switch_from(struct vcpu *v);
 static void vmx_ctxt_switch_to(struct vcpu *v);
+static int  vmx_alloc_vpid(struct domain *d);
+static void vmx_free_vpid(struct domain *d);
+
+static int vmx_domain_initialise(struct domain *d)
+{
+    return vmx_alloc_vpid(d);
+}
+
+static void vmx_domain_destroy(struct domain *d)
+{
+    vmx_free_vpid(d);
+}
 
 static int vmx_vcpu_initialise(struct vcpu *v)
 {
@@ -79,6 +93,7 @@ static int vmx_vcpu_initialise(struct vc
 
 static void vmx_vcpu_destroy(struct vcpu *v)
 {
+    ept_sync_all();
     vmx_destroy_vmcs(v);
 }
 
@@ -176,20 +191,14 @@ static int long_mode_do_msr_read(struct 
 
     case MSR_FS_BASE:
         msr_content = __vmread(GUEST_FS_BASE);
-        goto check_long_mode;
+        break;
 
     case MSR_GS_BASE:
         msr_content = __vmread(GUEST_GS_BASE);
-        goto check_long_mode;
+        break;
 
     case MSR_SHADOW_GS_BASE:
-        msr_content = v->arch.hvm_vmx.shadow_gs;
-    check_long_mode:
-        if ( !(vmx_long_mode_enabled(v)) )
-        {
-            vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
-            return 0;
-        }
+        rdmsrl(MSR_SHADOW_GS_BASE, msr_content);
         break;
 
     case MSR_STAR:
@@ -274,9 +283,6 @@ static int long_mode_do_msr_write(struct
     case MSR_FS_BASE:
     case MSR_GS_BASE:
     case MSR_SHADOW_GS_BASE:
-        if ( !vmx_long_mode_enabled(v) )
-            goto gp_fault;
-
         if ( !is_canonical_address(msr_content) )
             goto uncanonical_address;
 
@@ -285,10 +291,7 @@ static int long_mode_do_msr_write(struct
         else if ( ecx == MSR_GS_BASE )
             __vmwrite(GUEST_GS_BASE, msr_content);
         else
-        {
-            v->arch.hvm_vmx.shadow_gs = msr_content;
             wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
-        }
 
         break;
 
@@ -346,7 +349,10 @@ static void vmx_restore_host_msrs(void)
 
 static void vmx_save_guest_msrs(struct vcpu *v)
 {
-    /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */
+    /*
+     * We cannot cache SHADOW_GS_BASE while the VCPU runs, as it can
+     * be updated at any time via SWAPGS, which we cannot trap.
+     */
     rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
 }
 
@@ -607,28 +613,109 @@ void vmx_vmcs_save(struct vcpu *v, struc
     c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
     c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
 
-    /*
-     * Save any event/interrupt that was being injected when we last
-     * exited. IDT_VECTORING_INFO_FIELD has priority, as anything in
-     * VM_ENTRY_INTR_INFO_FIELD is either a fault caused by the first
-     * event, which will happen the next time, or an interrupt, which we
-     * never inject when IDT_VECTORING_INFO_FIELD is valid.
-     */
-    if ( (ev = __vmread(IDT_VECTORING_INFO_FIELD)) & INTR_INFO_VALID_MASK )
-    {
-        c->pending_event = ev;
-        c->error_code = __vmread(IDT_VECTORING_ERROR_CODE);
-    }
-    else if ( (ev = __vmread(VM_ENTRY_INTR_INFO_FIELD)) &
-              INTR_INFO_VALID_MASK )
+    c->pending_event = 0;
+    c->error_code = 0;
+    if ( ((ev = __vmread(VM_ENTRY_INTR_INFO_FIELD)) & INTR_INFO_VALID_MASK) &&
+         vmx_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )
     {
         c->pending_event = ev;
         c->error_code = __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE);
     }
-    else
+
+    vmx_vmcs_exit(v);
+}
+
+/* the caller needs to check if the guest is switching to PAE mode */
+static void vmx_load_pdptrs(struct vcpu *v)
+{
+    uint64_t *guest_pdptrs;
+    unsigned long cr3 = v->arch.hvm_vmx.cpu_cr3, mfn;
+    char *p;
+
+    if ( cr3 & 0x1fUL )
+        goto crash;
+
+    mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT));
+    p = map_domain_page(mfn);
+    guest_pdptrs = (uint64_t *)(p + (cr3 & ~PAGE_MASK));
+
+    vmx_vmcs_enter(v);
+
+    __vmwrite(GUEST_PDPTR0, guest_pdptrs[0]);
+    __vmwrite(GUEST_PDPTR1, guest_pdptrs[1]);
+    __vmwrite(GUEST_PDPTR2, guest_pdptrs[2]);
+    __vmwrite(GUEST_PDPTR3, guest_pdptrs[3]);
+#ifdef __i386__
+    __vmwrite(GUEST_PDPTR0_HIGH, guest_pdptrs[0] >> 32);
+    __vmwrite(GUEST_PDPTR1_HIGH, guest_pdptrs[1] >> 32);
+    __vmwrite(GUEST_PDPTR2_HIGH, guest_pdptrs[2] >> 32);
+    __vmwrite(GUEST_PDPTR3_HIGH, guest_pdptrs[3] >> 32);
+#endif
+
+    vmx_vmcs_exit(v);
+    unmap_domain_page(p);
+    return;
+
+crash:
+    domain_crash(v->domain);
+}
+
+static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
+{
+    unsigned long cr4;
+
+    if ( !hap_enabled(v->domain) )
+        return;
+
+    ASSERT((v == current) || !vcpu_runnable(v));
+
+    vmx_vmcs_enter(v);
+
+    switch (cr)
     {
-        c->pending_event = 0;
-        c->error_code = 0;
+    case 0:
+        if ( vmx_paging_enabled(v) )
+            v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
+                                              CPU_BASED_CR3_STORE_EXITING);
+        else
+            v->arch.hvm_vmx.exec_control |= (CPU_BASED_CR3_LOAD_EXITING |
+                                             CPU_BASED_CR3_STORE_EXITING);
+
+        __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
+        break;
+
+    case 3:
+        if ( vmx_paging_enabled(v) )
+        {
+            if ( vmx_pae_enabled(v) && !vmx_long_mode_enabled(v) )
+                vmx_load_pdptrs(v);
+            __vmwrite(GUEST_CR3, v->arch.hvm_vmx.cpu_cr3);
+        }
+        else
+            __vmwrite(GUEST_CR3, HVM_IDENT_PT_PAGE);
+        vpid_sync_vcpu_all(v);
+        break;
+
+    case 4:
+        if ( vmx_paging_enabled(v) )
+        {
+            cr4 = HVM_CR4_HOST_MASK & ~X86_CR4_PAE;
+            cr4 |= v->arch.hvm_vmx.cpu_shadow_cr4;
+            if ( vmx_pae_enabled(v) && !vmx_long_mode_enabled(v) )
+                vmx_load_pdptrs(v);
+        }
+        else
+        {
+            cr4 = __vmread(GUEST_CR4) | HVM_CR4_HOST_MASK;
+            cr4 |= X86_CR4_PSE;
+            cr4 &= ~X86_CR4_PAE;
+        }
+
+        __vmwrite(GUEST_CR4, cr4);
+        break;
+
+    default:
+        BUG();
     }
 
     vmx_vmcs_exit(v);
@@ -636,7 +723,7 @@ void vmx_vmcs_save(struct vcpu *v, struc
 
 int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
 {
-    unsigned long mfn, old_base_mfn;
+    unsigned long mfn = 0, old_base_mfn;
 
     vmx_vmcs_enter(v);
 
@@ -645,8 +732,13 @@ int vmx_vmcs_restore(struct vcpu *v, str
     __vmwrite(GUEST_RFLAGS, c->rflags);
 
     v->arch.hvm_vmx.cpu_cr0 = (c->cr0 | X86_CR0_PE | X86_CR0_PG |
-                               X86_CR0_NE | X86_CR0_WP | X86_CR0_ET);
+                               X86_CR0_NE | X86_CR0_ET);
+
+    if ( paging_mode_shadow(v->domain) )
+        v->arch.hvm_vmx.cpu_cr0 |= X86_CR0_WP;
+
     __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
+
     v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
     __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
 
@@ -659,7 +751,7 @@ int vmx_vmcs_restore(struct vcpu *v, str
            __func__, c->cr3, c->cr0, c->cr4);
 #endif
 
-    if ( !vmx_paging_enabled(v) )
+    if ( !vmx_paging_enabled(v) || paging_mode_hap(v->domain) )
     {
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "%s: paging not enabled.", __func__);
         goto skip_cr3;
@@ -686,10 +778,14 @@ int vmx_vmcs_restore(struct vcpu *v, str
     if ( vmx_long_mode_enabled(v) )
         vmx_enable_long_mode(v);
 
-    __vmwrite(GUEST_CR4, (c->cr4 | HVM_CR4_HOST_MASK));
     v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
     __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
 
+    if ( paging_mode_shadow(v->domain) )
+        __vmwrite(GUEST_CR4, (c->cr4 | HVM_CR4_HOST_MASK));
+    else
+        v->arch.hvm_vmx.cpu_cr3 = c->cr3;
+
     __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
     __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
 
@@ -746,36 +842,18 @@ int vmx_vmcs_restore(struct vcpu *v, str
 
     paging_update_paging_modes(v);
 
-    if ( c->pending_valid )
+    if ( paging_mode_hap(v->domain) )
     {
-        vmx_vmcs_enter(v);
+        vmx_update_guest_cr(v, 0);
+        vmx_update_guest_cr(v, 3);
+        vmx_update_guest_cr(v, 4);
+    }
 
+    if ( c->pending_valid )
+    {
         gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
                  c->pending_event, c->error_code);
 
-        /* SVM uses type 3 ("Exception") for #OF and #BP; VMX uses type 6 */
-        if ( (c->pending_type == 3) &&
-             ((c->pending_vector == 3) || (c->pending_vector == 4)) )
-            c->pending_type = 6;
-
-        /* For software exceptions, we need to tell the hardware the
-         * instruction length as well (hmmm). */
-        if ( c->pending_type > 4 )
-        {
-            int addrbytes, ilen;
-            if ( (c->cs_arbytes & X86_SEG_AR_CS_LM_ACTIVE) &&
-                 (c->msr_efer & EFER_LMA) )
-                addrbytes = 8;
-            else if ( c->cs_arbytes & X86_SEG_AR_DEF_OP_SIZE )
-                addrbytes = 4;
-            else
-                addrbytes = 2;
-
-            ilen = hvm_instruction_fetch(c->rip, addrbytes, NULL);
-            __vmwrite(VM_ENTRY_INSTRUCTION_LEN, ilen);
-        }
-
-        /* Sanity check */
         if ( (c->pending_type == 1) || (c->pending_type > 6) ||
              (c->pending_reserved != 0) )
         {
@@ -784,12 +862,13 @@ int vmx_vmcs_restore(struct vcpu *v, str
             return -EINVAL;
         }
 
-        /* Re-inject the exception */
-        __vmwrite(VM_ENTRY_INTR_INFO_FIELD, c->pending_event);
-        __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code);
-        v->arch.hvm_vmx.vector_injected = 1;
-
-        vmx_vmcs_exit(v);
+        if ( vmx_event_needs_reinjection(c->pending_type, c->pending_vector) )
+        {
+            vmx_vmcs_enter(v);
+            __vmwrite(VM_ENTRY_INTR_INFO_FIELD, c->pending_event);
+            __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code);
+            vmx_vmcs_exit(v);
+        }
     }
 
     return 0;
@@ -825,7 +904,7 @@ static void vmx_save_cpu_state(struct vc
     data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
 #endif
 
-    data->tsc = hvm_get_guest_time(v);
+    data->tsc = hvm_get_guest_tsc(v);
 
     dump_msr_state(guest_state);
 }
@@ -847,7 +926,7 @@ static void vmx_load_cpu_state(struct vc
 
     v->arch.hvm_vmx.vmxassist_enabled = !(data->cr0 & X86_CR0_PE);
 
-    hvm_set_guest_time(v, data->tsc);
+    hvm_set_guest_tsc(v, data->tsc);
 
     dump_msr_state(guest_state);
 }
@@ -882,6 +961,10 @@ static void vmx_ctxt_switch_from(struct 
 
 static void vmx_ctxt_switch_to(struct vcpu *v)
 {
+    /* HOST_CR4 in VMCS is always mmu_cr4_features. Sync CR4 now. */
+    if ( unlikely(read_cr4() != mmu_cr4_features) )
+        write_cr4(mmu_cr4_features);
+
     vmx_restore_guest_msrs(v);
     vmx_restore_dr(v);
 }
@@ -1005,7 +1088,7 @@ static unsigned long vmx_get_segment_bas
 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
                                      struct segment_register *reg)
 {
-    u16 attr = 0;
+    uint32_t attr = 0;
 
     ASSERT(v == current);
 
@@ -1074,6 +1157,133 @@ static void vmx_get_segment_register(str
     reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
 }
 
+static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
+                                     struct segment_register *reg)
+{
+    uint32_t attr;
+
+    ASSERT(v == current);
+
+    attr = reg->attr.bytes;
+    attr = ((attr & 0xf00) << 4) | (attr & 0xff);
+
+    /* Not-present must mean unusable. */
+    if ( !reg->attr.fields.p )
+        attr |= (1u << 16);
+
+    switch ( seg )
+    {
+    case x86_seg_cs:
+        __vmwrite(GUEST_CS_SELECTOR, reg->sel);
+        __vmwrite(GUEST_CS_LIMIT, reg->limit);
+        __vmwrite(GUEST_CS_BASE, reg->base);
+        __vmwrite(GUEST_CS_AR_BYTES, attr);
+        guest_cpu_user_regs()->cs = reg->sel;
+        break;
+    case x86_seg_ds:
+        __vmwrite(GUEST_DS_SELECTOR, reg->sel);
+        __vmwrite(GUEST_DS_LIMIT, reg->limit);
+        __vmwrite(GUEST_DS_BASE, reg->base);
+        __vmwrite(GUEST_DS_AR_BYTES, attr);
+        break;
+    case x86_seg_es:
+        __vmwrite(GUEST_ES_SELECTOR, reg->sel);
+        __vmwrite(GUEST_ES_LIMIT, reg->limit);
+        __vmwrite(GUEST_ES_BASE, reg->base);
+        __vmwrite(GUEST_ES_AR_BYTES, attr);
+        break;
+    case x86_seg_fs:
+        __vmwrite(GUEST_FS_SELECTOR, reg->sel);
+        __vmwrite(GUEST_FS_LIMIT, reg->limit);
+        __vmwrite(GUEST_FS_BASE, reg->base);
+        __vmwrite(GUEST_FS_AR_BYTES, attr);
+        break;
+    case x86_seg_gs:
+        __vmwrite(GUEST_GS_SELECTOR, reg->sel);
+        __vmwrite(GUEST_GS_LIMIT, reg->limit);
+        __vmwrite(GUEST_GS_BASE, reg->base);
+        __vmwrite(GUEST_GS_AR_BYTES, attr);
+        break;
+    case x86_seg_ss:
+        __vmwrite(GUEST_SS_SELECTOR, reg->sel);
+        __vmwrite(GUEST_SS_LIMIT, reg->limit);
+        __vmwrite(GUEST_SS_BASE, reg->base);
+        __vmwrite(GUEST_SS_AR_BYTES, attr);
+        guest_cpu_user_regs()->ss = reg->sel;
+        break;
+    case x86_seg_tr:
+        __vmwrite(GUEST_TR_SELECTOR, reg->sel);
+        __vmwrite(GUEST_TR_LIMIT, reg->limit);
+        __vmwrite(GUEST_TR_BASE, reg->base);
+        __vmwrite(GUEST_TR_AR_BYTES, attr);
+        break;
+    case x86_seg_gdtr:
+        __vmwrite(GUEST_GDTR_LIMIT, reg->limit);
+        __vmwrite(GUEST_GDTR_BASE, reg->base);
+        break;
+    case x86_seg_idtr:
+        __vmwrite(GUEST_IDTR_LIMIT, reg->limit);
+        __vmwrite(GUEST_IDTR_BASE, reg->base);
+        break;
+    case x86_seg_ldtr:
+        __vmwrite(GUEST_LDTR_SELECTOR, reg->sel);
+        __vmwrite(GUEST_LDTR_LIMIT, reg->limit);
+        __vmwrite(GUEST_LDTR_BASE, reg->base);
+        __vmwrite(GUEST_LDTR_AR_BYTES, attr);
+        break;
+    default:
+        BUG();
+    }
+}
+
+static int vmx_set_cr3(unsigned long value)
+{
+    struct vcpu *v = current;
+    unsigned long mfn, old_base_mfn;
+
+    /*
+     * If paging is not enabled yet, simply copy the value to CR3.
+     */
+    if ( !vmx_paging_enabled(v) || paging_mode_hap(v->domain) )
+    {
+        v->arch.hvm_vmx.cpu_cr3 = value;
+        return X86EMUL_OKAY;
+    }
+
+    /*
+     * We make a new one if the shadow does not exist.
+     */
+    if ( value == v->arch.hvm_vmx.cpu_cr3 ) {
+        /*
+         * This is simple TLB flush, implying the guest has
+         * removed some translation or changed page attributes.
+         * We simply invalidate the shadow.
+         */
+        mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
+        if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
+            return X86EMUL_UNHANDLEABLE;
+        paging_update_cr3(v);
+    } else {
+        /*
+         * If different, make a shadow. Check if the PDBR is valid
+         * first.
+         */
+        HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
+        mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
+        if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
+            return X86EMUL_UNHANDLEABLE;
+        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
+        v->arch.guest_table = pagetable_from_pfn(mfn);
+        if ( old_base_mfn )
+            put_page(mfn_to_page(old_base_mfn));
+        v->arch.hvm_vmx.cpu_cr3 = value;
+        update_cr3(v);
+        HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
+    }
+
+    return X86EMUL_OKAY;
+}
+
 /* Make sure that xen intercepts any FP accesses from current */
 static void vmx_stts(struct vcpu *v)
 {
@@ -1135,16 +1345,29 @@ static void vmx_init_hypercall_page(stru
     *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
 }
 
-static int vmx_interrupts_enabled(struct vcpu *v) 
+static int vmx_interrupts_enabled(struct vcpu *v, enum hvm_intack type)
 {
-    unsigned long eflags = __vmread(GUEST_RFLAGS); 
-    return !irq_masked(eflags); 
+    unsigned long intr_shadow, eflags;
+
+    ASSERT(v == current);
+
+    intr_shadow = __vmread(GUEST_INTERRUPTIBILITY_INFO);
+
+    if ( type == hvm_intack_nmi )
+        return !(intr_shadow & (VMX_INTR_SHADOW_STI|
+                                VMX_INTR_SHADOW_MOV_SS|
+                                VMX_INTR_SHADOW_NMI));
+
+    ASSERT((type == hvm_intack_pic) || (type == hvm_intack_lapic));
+    eflags = __vmread(GUEST_RFLAGS);
+    return ((eflags & X86_EFLAGS_IF) &&
+            !(intr_shadow & (VMX_INTR_SHADOW_STI|VMX_INTR_SHADOW_MOV_SS)));
 }
 
 
 static void vmx_update_host_cr3(struct vcpu *v)
 {
-    ASSERT( (v == current) || !vcpu_runnable(v) );
+    ASSERT((v == current) || !vcpu_runnable(v));
     vmx_vmcs_enter(v);
     __vmwrite(HOST_CR3, v->arch.cr3);
     vmx_vmcs_exit(v);
@@ -1152,17 +1375,22 @@ static void vmx_update_host_cr3(struct v
 
 static void vmx_update_guest_cr3(struct vcpu *v)
 {
-    ASSERT( (v == current) || !vcpu_runnable(v) );
+    ASSERT((v == current) || !vcpu_runnable(v));
     vmx_vmcs_enter(v);
     __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
+    vpid_sync_vcpu_all(v);
     vmx_vmcs_exit(v);
 }
 
 static void vmx_flush_guest_tlbs(void)
 {
-    /* No tagged TLB support on VMX yet.  The fact that we're in Xen
-     * at all means any guest will have a clean TLB when it's next run,
-     * because VMRESUME will flush it for us. */
+    /* If VPID (i.e. tagged TLB support) is not enabled, the fact that
+     * we're in Xen at all means any guest will have a clean TLB when
+     * it's next run, because VMRESUME will flush it for us. 
+     * 
+     * If enabled, we invalidate all translations associated with all
+     * VPID values */
+    vpid_sync_all();
 }
 
 static void vmx_inject_exception(
@@ -1179,14 +1407,10 @@ static void vmx_update_vtpr(struct vcpu 
     /* VMX doesn't have a V_TPR field */
 }
 
-static int vmx_event_injection_faulted(struct vcpu *v)
+static int vmx_event_pending(struct vcpu *v)
 {
-    unsigned int idtv_info_field;
-
     ASSERT(v == current);
-
-    idtv_info_field = __vmread(IDT_VECTORING_INFO_FIELD);
-    return (idtv_info_field & INTR_INFO_VALID_MASK);
+    return (__vmread(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK);
 }
 
 static void disable_intercept_for_msr(u32 msr)
@@ -1212,6 +1436,8 @@ static void disable_intercept_for_msr(u3
 static struct hvm_function_table vmx_function_table = {
     .name                 = "VMX",
     .disable              = stop_vmx,
+    .domain_initialise    = vmx_domain_initialise,
+    .domain_destroy       = vmx_domain_destroy,
     .vcpu_initialise      = vmx_vcpu_initialise,
     .vcpu_destroy         = vmx_vcpu_destroy,
     .store_cpu_guest_regs = vmx_store_cpu_guest_regs,
@@ -1227,8 +1453,10 @@ static struct hvm_function_table vmx_fun
     .get_guest_ctrl_reg   = vmx_get_ctrl_reg,
     .get_segment_base     = vmx_get_segment_base,
     .get_segment_register = vmx_get_segment_register,
+    .set_segment_register = vmx_set_segment_register,
     .update_host_cr3      = vmx_update_host_cr3,
     .update_guest_cr3     = vmx_update_guest_cr3,
+    .set_cr3              = vmx_set_cr3,
     .flush_guest_tlbs     = vmx_flush_guest_tlbs,
     .update_vtpr          = vmx_update_vtpr,
     .stts                 = vmx_stts,
@@ -1236,9 +1464,13 @@ static struct hvm_function_table vmx_fun
     .inject_exception     = vmx_inject_exception,
     .init_ap_context      = vmx_init_ap_context,
     .init_hypercall_page  = vmx_init_hypercall_page,
-    .event_injection_faulted = vmx_event_injection_faulted
+    .event_pending        = vmx_event_pending,
+    .update_guest_cr      = vmx_update_guest_cr
 };
 
+static unsigned long *vpid_bitmap;
+#define VPID_BITMAP_SIZE ((1u << VMCS_VPID_WIDTH) / MAX_VIRT_CPUS)
+
 int start_vmx(void)
 {
     u32 eax, edx;
@@ -1291,6 +1523,26 @@ int start_vmx(void)
         return 0;
     }
 
+    vmx_function_table.hap_supported = cpu_has_vmx_ept;
+    vmx_function_table.hap_1gb_pgtb = 0;
+
+    ept_sync_all();
+
+    vpid_sync_all();
+
+    if ( cpu_has_vmx_vpid )
+    {
+        printk("VMX: VPID is available.\n");
+
+        vpid_bitmap = xmalloc_array(
+          unsigned long, BITS_TO_LONGS(VPID_BITMAP_SIZE));
+        BUG_ON(vpid_bitmap == NULL);
+        memset(vpid_bitmap, 0, BITS_TO_LONGS(VPID_BITMAP_SIZE) * sizeof(long));
+
+        /* VPID 0 is used by VMX root mode (the hypervisor). */
+        __set_bit(0, vpid_bitmap);
+    }
+
     vmx_save_host_msrs();
 
     if ( smp_processor_id() != 0 )
@@ -1311,11 +1563,44 @@ int start_vmx(void)
         disable_intercept_for_msr(MSR_IA32_SYSENTER_CS);
         disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP);
         disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP);
+        if ( cpu_has_vmx_pat && 
+                    vmx_function_table.hap_supported )
+            disable_intercept_for_msr(MSR_IA32_CR_PAT);
     }
 
     return 1;
 }
 
+static int vmx_alloc_vpid(struct domain *d)
+{
+    int idx;
+
+    if ( !cpu_has_vmx_vpid )
+        return 0;
+
+    do {
+        idx = find_first_zero_bit(vpid_bitmap, VPID_BITMAP_SIZE);
+        if ( idx >= VPID_BITMAP_SIZE )
+        {
+            dprintk(XENLOG_WARNING, "VMX VPID space exhausted.\n");
+            return -EBUSY;
+        }
+    }
+    while ( test_and_set_bit(idx, vpid_bitmap) );
+
+    d->arch.hvm_domain.vmx_vpid_base = idx * MAX_VIRT_CPUS;
+    return 0;
+}
+
+static void vmx_free_vpid(struct domain *d)
+{
+    if ( !cpu_has_vmx_vpid )
+        return;
+
+    clear_bit(d->arch.hvm_domain.vmx_vpid_base / MAX_VIRT_CPUS, vpid_bitmap);
+}
+
+
 /*
  * Not all cases receive valid value in the VM-exit instruction length field.
  * Callers must know what they're doing!
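
The hunk above carves the VPID space into blocks of MAX_VIRT_CPUS tags: a domain claims one block in vpid_bitmap (block 0 stays reserved because VPID 0 tags the hypervisor itself) and construct_vmcs() gives each vCPU vmx_vpid_base + vcpu_id. A standalone sketch of that allocator; the 16-bit tag width and MAX_VIRT_CPUS = 32 are assumptions standing in for VMCS_VPID_WIDTH and Xen's real constant:

#include <stdint.h>
#include <stdio.h>

#define VPID_WIDTH     16                               /* VPIDs are 16-bit tags */
#define MAX_VIRT_CPUS  32                               /* assumed per-domain vCPU limit */
#define NR_BLOCKS      ((1u << VPID_WIDTH) / MAX_VIRT_CPUS)

/* One flag per block of MAX_VIRT_CPUS consecutive VPIDs. */
static unsigned char block_used[NR_BLOCKS];

/* Claim a free block; returns 0 and the block's base VPID, or -1 if the
 * space is exhausted.  Block 0 is skipped, mirroring the reserved bit 0. */
static int alloc_vpid_base(unsigned int *base)
{
    for ( unsigned int i = 1; i < NR_BLOCKS; i++ )
        if ( !block_used[i] )
        {
            block_used[i] = 1;
            *base = i * MAX_VIRT_CPUS;
            return 0;
        }
    return -1;
}

static void free_vpid_base(unsigned int base)
{
    block_used[base / MAX_VIRT_CPUS] = 0;
}

int main(void)
{
    unsigned int base;
    if ( alloc_vpid_base(&base) == 0 )
    {
        printf("domain VPID base %u, vCPU 2 uses VPID %u\n", base, base + 2);
        free_vpid_base(base);
    }
    return 0;
}
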
@@ -1391,7 +1676,7 @@ static void vmx_do_cpuid(struct cpu_user
         if ( (value & 7) || (mfn == INVALID_MFN) ||
              !v->arch.hvm_vmx.vmxassist_enabled )
         {
-            domain_crash(v->domain);
+            hvm_inject_exception(TRAP_gp_fault, 0, 0);
             return;
         }
 
@@ -1405,6 +1690,10 @@ static void vmx_do_cpuid(struct cpu_user
     } else {
         hvm_cpuid(input, &eax, &ebx, &ecx, &edx);
 
+        /* don't support features > 0xa */
+        if ( unlikely(input == 0x0) && (eax > 0xa) )
+            eax = 0xa;
+
         if ( input == 0x00000001 )
         {
             /* Mask off reserved bits. */
@@ -1416,14 +1705,17 @@ static void vmx_do_cpuid(struct cpu_user
             ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
                      bitmaskof(X86_FEATURE_EST)  |
                      bitmaskof(X86_FEATURE_TM2)  |
-                     bitmaskof(X86_FEATURE_CID));
+                     bitmaskof(X86_FEATURE_CID)  |
+                     bitmaskof(X86_FEATURE_XSAVE)|
+                     bitmaskof(X86_FEATURE_OSXSAVE));
 
             edx &= ~(bitmaskof(X86_FEATURE_HT)   |
                      bitmaskof(X86_FEATURE_ACPI) |
                      bitmaskof(X86_FEATURE_ACC));
         }
 
-        if ( input == 0x00000006 || input == 0x00000009 || input == 0x0000000A )
+        if ( input == 0x00000006 || input == 0x00000009 ||
+             input == 0x0000000A || input == 0x0000000B )
             eax = ebx = ecx = edx = 0x0;
     }
 
@@ -1488,7 +1780,8 @@ static void vmx_do_invlpg(unsigned long 
      * We do the safest things first, then try to update the shadow
      * copying from guest
      */
-    paging_invlpg(v, va);
+    if ( paging_invlpg(v, va) )
+        vpid_sync_vcpu_gva(v, va);
 }
 
 /*
@@ -1974,7 +2267,7 @@ static int vmx_world_restore(struct vcpu
     v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
     __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
 
-    if ( !vmx_paging_enabled(v) )
+    if ( !vmx_paging_enabled(v) || paging_mode_hap(v->domain) )
         goto skip_cr3;
 
     if ( c->cr3 == v->arch.hvm_vmx.cpu_cr3 )
@@ -2011,10 +2304,18 @@ static int vmx_world_restore(struct vcpu
     else
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
 
-    __vmwrite(GUEST_CR4, (c->cr4 | HVM_CR4_HOST_MASK));
     v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
     __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
 
+    if ( paging_mode_shadow(v->domain) )
+        __vmwrite(GUEST_CR4, (c->cr4 | HVM_CR4_HOST_MASK));
+    else
+    {
+        v->arch.hvm_vmx.cpu_cr3 = c->cr3;
+        vmx_update_guest_cr(v, 3);
+        vmx_update_guest_cr(v, 4);
+    }
+
     __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
     __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
 
@@ -2153,10 +2454,11 @@ static int vmx_assist(struct vcpu *v, in
 static int vmx_set_cr0(unsigned long value)
 {
     struct vcpu *v = current;
-    unsigned long mfn;
+    struct domain *d = v->domain;
     unsigned long eip;
     int paging_enabled;
     unsigned long old_cr0;
+    unsigned long mfn;
     unsigned long old_base_mfn;
 
     HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);
@@ -2181,12 +2483,23 @@ static int vmx_set_cr0(unsigned long val
     paging_enabled = old_cr0 & X86_CR0_PG;
 
     v->arch.hvm_vmx.cpu_cr0 = (value | X86_CR0_PE | X86_CR0_PG
-                               | X86_CR0_NE | X86_CR0_WP);
+                               | X86_CR0_NE);
+
+    if ( paging_mode_shadow(d) )
+        v->arch.hvm_vmx.cpu_cr0 |= X86_CR0_WP;
+
     __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
 
     v->arch.hvm_vmx.cpu_shadow_cr0 = value;
     __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
 
+    if ( paging_mode_hap(d) )
+    {
+        vmx_update_guest_cr(v, 0);
+        vmx_update_guest_cr(v, 3);
+        vmx_update_guest_cr(v, 4);
+    }
+
     /* Trying to enable paging. */
     if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled )
     {
@@ -2207,37 +2520,40 @@ static int vmx_set_cr0(unsigned long val
         /*
          * The guest CR3 must be pointing to the guest physical.
          */
-        mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
-        if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
+        if ( paging_mode_shadow(v->domain) )
         {
-            gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
-                     v->arch.hvm_vmx.cpu_cr3, mfn);
-            domain_crash(v->domain);
-            return 0;
-        }
+            mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
+            if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
+            {
+                gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
+                  v->arch.hvm_vmx.cpu_cr3, mfn);
+                domain_crash(v->domain);
+                return 0;
+            }
 
-        /*
-         * Now arch.guest_table points to machine physical.
-         */
-        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
-        v->arch.guest_table = pagetable_from_pfn(mfn);
-        if ( old_base_mfn )
-            put_page(mfn_to_page(old_base_mfn));
+            /*
+             * Now arch.guest_table points to machine physical.
+             */
+            old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
+            v->arch.guest_table = pagetable_from_pfn(mfn);
+            if ( old_base_mfn )
+                put_page(mfn_to_page(old_base_mfn));
 
-        paging_update_paging_modes(v);
+            HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
+                        (unsigned long) (mfn << PAGE_SHIFT));
 
-        HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
-                    (unsigned long) (mfn << PAGE_SHIFT));
+            HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
+                        v->arch.hvm_vmx.cpu_cr3, mfn);
+        }
 
-        HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
-                    v->arch.hvm_vmx.cpu_cr3, mfn);
+        paging_update_paging_modes(v);
     }
 
     /* Trying to disable paging. */
     if ( ((value & (X86_CR0_PE | X86_CR0_PG)) != (X86_CR0_PE | X86_CR0_PG)) &&
          paging_enabled )
     {
-        if ( v->arch.hvm_vmx.cpu_cr3 )
+        if ( v->arch.hvm_vmx.cpu_cr3 && paging_mode_shadow(v->domain) )
         {
             put_page(mfn_to_page(get_mfn_from_gpfn(
                       v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)));
@@ -2316,7 +2632,7 @@ static int vmx_set_cr0(unsigned long val
  */
 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
 {
-    unsigned long value, old_cr, old_base_mfn, mfn;
+    unsigned long value, old_cr;
     struct vcpu *v = current;
     struct vlapic *vlapic = vcpu_vlapic(v);
 
@@ -2348,45 +2664,8 @@ static int mov_to_cr(int gp, int cr, str
         return vmx_set_cr0(value);
 
     case 3:
-        /*
-         * If paging is not enabled yet, simply copy the value to CR3.
-         */
-        if ( !vmx_paging_enabled(v) )
-        {
-            v->arch.hvm_vmx.cpu_cr3 = value;
-            break;
-        }
-
-        /*
-         * We make a new one if the shadow does not exist.
-         */
-        if ( value == v->arch.hvm_vmx.cpu_cr3 ) {
-            /*
-             * This is simple TLB flush, implying the guest has
-             * removed some translation or changed page attributes.
-             * We simply invalidate the shadow.
-             */
-            mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
-            if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
-                goto bad_cr3;
-            paging_update_cr3(v);
-        } else {
-            /*
-             * If different, make a shadow. Check if the PDBR is valid
-             * first.
-             */
-            HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
-            mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
-            if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
-                goto bad_cr3;
-            old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
-            v->arch.guest_table = pagetable_from_pfn(mfn);
-            if ( old_base_mfn )
-                put_page(mfn_to_page(old_base_mfn));
-            v->arch.hvm_vmx.cpu_cr3 = value;
-            update_cr3(v);
-            HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
-        }
+        if ( vmx_set_cr3(value) != X86EMUL_OKAY )
+            goto bad_cr3;
         break;
 
     case 4: /* CR4 */
@@ -2403,7 +2682,7 @@ static int mov_to_cr(int gp, int cr, str
 
         if ( (value & X86_CR4_PAE) && !(old_cr & X86_CR4_PAE) )
         {
-            if ( vmx_pgbit_test(v) )
+            if ( vmx_pgbit_test(v) && paging_mode_shadow(v->domain) )
             {
                 /* The guest is a 32-bit PAE guest. */
 #if CONFIG_PAGING_LEVELS >= 3
@@ -2441,10 +2720,17 @@ static int mov_to_cr(int gp, int cr, str
             }
         }
 
-        __vmwrite(GUEST_CR4, value | HVM_CR4_HOST_MASK);
         v->arch.hvm_vmx.cpu_shadow_cr4 = value;
         __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
 
+        if ( paging_mode_shadow(v->domain) )
+            __vmwrite(GUEST_CR4, (value | HVM_CR4_HOST_MASK));
+        else
+        {
+            vmx_update_guest_cr(v, 3);
+            vmx_update_guest_cr(v, 4);
+        }
+
         /*
          * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
          * all TLB entries except global entries.
@@ -2572,7 +2858,7 @@ static int vmx_do_msr_read(struct cpu_us
     switch ( ecx )
     {
     case MSR_IA32_TIME_STAMP_COUNTER:
-        msr_content = hvm_get_guest_time(v);
+        msr_content = hvm_get_guest_tsc(v);
         break;
     case MSR_IA32_SYSENTER_CS:
         msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
@@ -2655,8 +2941,9 @@ struct page_info * change_guest_physmap_
 
         mfn = page_to_mfn(pg);
         d->arch.hvm_domain.apic_access_page = pg;
+        d->arch.hvm_domain.vmx_apic_access_mfn = mfn;
 
-        guest_physmap_add_page(d, pfn, mfn);
+        guest_physmap_add_page(d, pfn, mfn, 0);
 
         d->arch.hvm_domain.physmap_changed_for_vlapic_access = 1;
 
@@ -2667,7 +2954,7 @@ struct page_info * change_guest_physmap_
         if ( d->arch.hvm_domain.physmap_changed_for_vlapic_access )
         {
             mfn = page_to_mfn(pg);
-            guest_physmap_remove_page(d, pfn, mfn);
+            guest_physmap_remove_page(d, pfn, mfn, 0);
             flush_tlb_mask(d->domain_dirty_cpumask);
 
             d->arch.hvm_domain.physmap_changed_for_vlapic_access = 0;
@@ -2716,7 +3003,7 @@ static void check_vlapic_msr_for_vtpr(st
         vcpu_vlapic(v)->mmap_vtpr_enabled = 1;
 
         v->arch.hvm_vcpu.u.vmx.exec_control |=
-            ( ACTIVATE_SECONDARY_CONTROLS | CPU_BASED_TPR_SHADOW );
+            ( CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | CPU_BASED_TPR_SHADOW );
         __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
                   v->arch.hvm_vcpu.u.vmx.exec_control);
         tmp  = __vmread(SECONDARY_VM_EXEC_CONTROL);
@@ -2752,7 +3039,7 @@ static int vmx_do_msr_write(struct cpu_u
     switch ( ecx )
     {
     case MSR_IA32_TIME_STAMP_COUNTER:
-        hvm_set_guest_time(v, msr_content);
+        hvm_set_guest_tsc(v, msr_content);
         pt_reset(v);
         break;
     case MSR_IA32_SYSENTER_CS:
@@ -2843,45 +3130,60 @@ static void vmx_do_extint(struct cpu_use
     }
 }
 
-static void vmx_reflect_exception(struct vcpu *v)
+static void ept_handle_violation(unsigned long qualification, paddr_t gpa)
 {
-    int error_code, intr_info, vector;
-
-    intr_info = __vmread(VM_EXIT_INTR_INFO);
-    vector = intr_info & 0xff;
-    if ( intr_info & INTR_INFO_DELIVER_CODE_MASK )
-        error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
-    else
-        error_code = VMX_DELIVER_NO_ERROR_CODE;
+    unsigned long gla_validity = qualification & EPT_GLA_VALIDITY_MASK;
+    struct domain *d = current->domain;
+    u64 gfn = gpa >> PAGE_SHIFT;
+    mfn_t mfn;
+    p2m_type_t t;
+
+    /* GPA exceeds GAW. */
+    if ( unlikely(qualification & EPT_GAW_VIOLATION) )
+    {
+        printk("EPT violation: guest physical address %"PRIpaddr" exceeded "
+               "its width limit.\n", gpa);
+        domain_crash(d);
+    }
+
+    /* The validity of the guest-linear address field has 4 values:
+     * 00   -   EPT_GLA_VALIDITY_PDPTR_LOAD
+     * 01   -   EPT_GLA_VALIDITY_GPT_WALK
+     * 10   -   EPT_GLA_VALIDITY_RSVD
+     * 11   -   EPT_GLA_VALIDITY_MATCH
+     *
+     * 11 is the normal case. 01 also covers the case where a no-write
+     * EPT page is encountered while the CPU tries to set an A or D bit
+     * during a guest page-table walk; this can occur in log-dirty mode.
+     */
 
-#ifndef NDEBUG
+    if ( gla_validity == EPT_GLA_VALIDITY_RSVD ||
+         gla_validity == EPT_GLA_VALIDITY_PDPTR_LOAD )
     {
-        unsigned long rip;
-
-        rip = __vmread(GUEST_RIP);
-        HVM_DBG_LOG(DBG_LEVEL_1, "rip = %lx, error_code = %x",
-                    rip, error_code);
+        printk("ept violation: reserved bit or pdptr load violation.\n");
+        domain_crash(d);
     }
-#endif /* NDEBUG */
 
-    /*
-     * According to Intel Virtualization Technology Specification for
-     * the IA-32 Intel Architecture (C97063-002 April 2005), section
-     * 2.8.3, SW_EXCEPTION should be used for #BP and #OV, and
-     * HW_EXCEPTION used for everything else.  The main difference
-     * appears to be that for SW_EXCEPTION, the EIP/RIP is incremented
-     * by VM_ENTER_INSTRUCTION_LEN bytes, whereas for HW_EXCEPTION,
-     * it is not.
-     */
-    if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_SW_EXCEPTION )
+    mfn = ept_get_entry(d, gfn, &t);
+
+    if ( unlikely( gla_validity != EPT_GLA_VALIDITY_MATCH) )
     {
-        int ilen = __get_instruction_length(); /* Safe: software exception */
-        vmx_inject_sw_exception(v, vector, ilen);
+        if ( !p2m_is_ram(t) || !paging_mode_log_dirty(d) )
+        {
+            domain_crash(d);
+            return;
+        }
     }
-    else
+
+    if ( p2m_is_ram(t) && paging_mode_log_dirty(d) )
     {
-        vmx_inject_hw_exception(v, vector, error_code);
+        paging_mark_dirty(d, mfn_x(mfn));
+        p2m_set_flags(d, gpa, __PAGE_HYPERVISOR|_PAGE_PSE);
+        flush_tlb_mask(d->domain_dirty_cpumask);
+        return;
     }
+    /* must be MMIO */
+    handle_mmio(gpa);
 }
 
 static void vmx_failed_vmentry(unsigned int exit_reason,
@@ -2920,10 +3222,19 @@ static void vmx_failed_vmentry(unsigned 
 
 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
 {
-    unsigned int exit_reason;
+    unsigned int exit_reason, idtv_info;
     unsigned long exit_qualification, inst_len = 0;
     struct vcpu *v = current;
 
+    if ( paging_mode_hap(v->domain) && hvm_paging_enabled(v) )
+    {
+        __asm__ __volatile__ ("mov"__OS" %%cr2, %0"
+                              : "=r"(v->arch.hvm_vmx.cpu_cr2));
+
+        /* __hvm_copy() need this when paging is enabled. */
+        v->arch.hvm_vmx.cpu_cr3 = __vmread(GUEST_CR3);
+    }
+ 
     exit_reason = __vmread(VM_EXIT_REASON);
 
     HVMTRACE_2D(VMEXIT, v, __vmread(GUEST_RIP), exit_reason);
@@ -2936,6 +3247,33 @@ asmlinkage void vmx_vmexit_handler(struc
     if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
         return vmx_failed_vmentry(exit_reason, regs);
 
+    /* Event delivery caused this intercept? Queue for redelivery. */
+    idtv_info = __vmread(IDT_VECTORING_INFO_FIELD);
+    if ( unlikely(idtv_info & INTR_INFO_VALID_MASK) &&
+         (exit_reason != EXIT_REASON_TASK_SWITCH) )
+    {
+        if ( vmx_event_needs_reinjection((idtv_info>>8)&7, idtv_info&0xff) )
+        {
+            /* See SDM 3B 25.7.1.1 and .2 for info about masking resvd bits. */
+            __vmwrite(VM_ENTRY_INTR_INFO_FIELD,
+                      idtv_info & ~INTR_INFO_RESVD_BITS_MASK);
+            if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK )
+                __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE,
+                          __vmread(IDT_VECTORING_ERROR_CODE));
+        }
+
+        /*
+         * Clear NMI-blocking interruptibility info if an NMI delivery faulted.
+         * Re-delivery will re-set it (see SDM 3B 25.7.1.2).
+         */
+        if ( (idtv_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI )
+            __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
+                      __vmread(GUEST_INTERRUPTIBILITY_INFO) &
+                      ~VMX_INTR_SHADOW_NMI);
+    }
+
+    hvm_maybe_deassert_evtchn_irq();
+
     switch ( exit_reason )
     {
     case EXIT_REASON_EXCEPTION_NMI:
@@ -2952,14 +3290,38 @@ asmlinkage void vmx_vmexit_handler(struc
 
         vector = intr_info & INTR_INFO_VECTOR_MASK;
 
+        /*
+         * Re-set the NMI shadow if vmexit caused by a guest IRET fault (see 3B
+         * 25.7.1.2, "Resuming Guest Software after Handling an Exception").
+         * (NB. If we emulate this IRET for any reason, we should re-clear!)
+         */
+        if ( unlikely(intr_info & INTR_INFO_NMI_UNBLOCKED_BY_IRET) &&
+             !(__vmread(IDT_VECTORING_INFO_FIELD) & INTR_INFO_VALID_MASK) &&
+             (vector != TRAP_double_fault) )
+            __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
+                    __vmread(GUEST_INTERRUPTIBILITY_INFO)|VMX_INTR_SHADOW_NMI);
+
         perfc_incra(cause_vector, vector);
 
         switch ( vector )
         {
         case TRAP_debug:
+            /*
+             * Updates DR6 where debugger can peek (See 3B 23.2.1,
+             * Table 23-1, "Exit Qualification for Debug Exceptions").
+             */
+            exit_qualification = __vmread(EXIT_QUALIFICATION);
+            write_debugreg(6, exit_qualification | 0xffff0ff0);
+            if ( !v->domain->debugger_attached )
+                goto exit_and_crash;
+            domain_pause_for_debugger();
+            break;
         case TRAP_int3:
             if ( !v->domain->debugger_attached )
                 goto exit_and_crash;
+            inst_len = __get_instruction_length(); /* Safe: INT3 */
+            __update_guest_eip(inst_len);
+            current->arch.gdbsx_vcpu_event = TRAP_int3;
             domain_pause_for_debugger();
             break;
         case TRAP_no_device:
@@ -2985,14 +3347,11 @@ asmlinkage void vmx_vmexit_handler(struc
             vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
             break;
         case TRAP_nmi:
-            if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI )
-            {
-                HVMTRACE_0D(NMI, v);
-                vmx_store_cpu_guest_regs(v, regs, NULL);
-                do_nmi(regs); /* Real NMI, vector 2: normal processing. */
-            }
-            else
-                vmx_reflect_exception(v);
+            if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) != INTR_TYPE_NMI )
+                goto exit_and_crash;
+            HVMTRACE_0D(NMI, v);
+            vmx_store_cpu_guest_regs(v, regs, NULL);
+            do_nmi(regs); /* Real NMI, vector 2: normal processing. */
             break;
         case TRAP_machine_check:
             HVMTRACE_0D(MCE, v);
@@ -3022,8 +3381,21 @@ asmlinkage void vmx_vmexit_handler(struc
         __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
                   v->arch.hvm_vmx.exec_control);
         break;
-    case EXIT_REASON_TASK_SWITCH:
-        goto exit_and_crash;
+    case EXIT_REASON_TASK_SWITCH: {
+        const enum hvm_task_switch_reason reasons[] = {
+            TSW_call_or_int, TSW_iret, TSW_jmp, TSW_call_or_int };
+        int32_t errcode = -1;
+        unsigned int idtv_info;
+        exit_qualification = __vmread(EXIT_QUALIFICATION);
+        idtv_info = __vmread(IDT_VECTORING_INFO_FIELD);
+        if ( (idtv_info & INTR_INFO_VALID_MASK) &&
+             (idtv_info & INTR_INFO_DELIVER_CODE_MASK) )
+            errcode = __vmread(IDT_VECTORING_ERROR_CODE);
+        hvm_task_switch((uint16_t)exit_qualification,
+                        reasons[(exit_qualification >> 30) & 3],
+                        errcode);
+        break;
+    }
     case EXIT_REASON_CPUID:
         inst_len = __get_instruction_length(); /* Safe: CPUID */
         __update_guest_eip(inst_len);
@@ -3113,6 +3485,21 @@ asmlinkage void vmx_vmexit_handler(struc
         break;
     }
 
+    case EXIT_REASON_EPT_VIOLATION:
+    {
+        paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
+#ifdef __i386__
+        gpa += (unsigned long long)__vmread(GUEST_PHYSICAL_ADDRESS_HIGH) << 32;
+#endif
+        exit_qualification = __vmread(EXIT_QUALIFICATION);
+        ept_handle_violation(exit_qualification, gpa);
+        break;
+    }
+
+    case EXIT_REASON_EPT_MISCONFIG:
+        domain_crash(current->domain);
+        break;
+
     default:
     exit_and_crash:
         gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
@@ -3127,6 +3514,22 @@ asmlinkage void vmx_trace_vmentry(void)
     HVMTRACE_0D(VMENTRY, v);
 }
 
+static void __ept_sync_domain(void *info)
+{
+    struct domain *d = info;
+    __invept(1, d->vcpu[0]->arch.hvm_vmx.ept_control.eptp, 0);
+}
+
+void ept_sync_domain(struct domain *d)
+{
+    /* Only if using EPT and this domain has some VCPUs to dirty. */
+    if ( hap_enabled(d) && d->vcpu[0] )
+    {
+        ASSERT(local_irq_is_enabled());
+        on_each_cpu(__ept_sync_domain, d, 1, 1);
+    }
+}
+
 /*
  * Local variables:
  * mode: C
diff -Naurp xen/arch/x86/hvm/vpic.c xen-redhat/arch/x86/hvm/vpic.c
--- xen/arch/x86/hvm/vpic.c
+++ xen-redhat/arch/x86/hvm/vpic.c
@@ -56,7 +56,7 @@ static int vpic_get_priority(struct hvm_
 
     /* prio = ffs(mask ROR vpic->priority_add); */
     asm ( "ror %%cl,%b1 ; bsf %1,%0"
-          : "=r" (prio) : "r" ((uint32_t)mask), "c" (vpic->priority_add) );
+          : "=r" (prio) : "q" ((uint32_t)mask), "c" (vpic->priority_add) );
     return prio;
 }
 
@@ -109,9 +109,12 @@ static void vpic_update_int_output(struc
     {
         if ( vpic->is_master )
         {
-            /* Master INT line is connected to VCPU0's VLAPIC LVT0. */
-            struct vcpu *v = vpic_domain(vpic)->vcpu[0];
-            if ( (v != NULL) && vlapic_accept_pic_intr(v) )
+            /*
+             * Master INT line is connected to whatever VCPU has its LAPIC
+             * LVT0 set up to receive ExtINT IRQs.
+             */
+            struct vcpu *v = vpic_domain(vpic)->arch.hvm_domain.i8259_target;
+            if ( v != NULL )
                 vcpu_kick(v);
         }
         else
@@ -182,8 +185,7 @@ static void vpic_ioport_write(
 
     vpic_lock(vpic);
 
-    addr &= 1;
-    if ( addr == 0 )
+    if ( (addr & 1) == 0 )
     {
         if ( val & 0x10 )
         {
@@ -250,7 +252,13 @@ static void vpic_ioport_write(
                 vpic->isr &= ~(1 << irq);
                 if ( cmd == 7 )
                     vpic->priority_add = (irq + 1) & 7;
-                break;
+                /* Release lock and EOI the physical interrupt (if any). */
+                vpic_update_int_output(vpic);
+                vpic_unlock(vpic);
+                hvm_dpci_eoi(current->domain,
+                             hvm_isa_irq_to_gsi((addr >> 7) ? (irq|8) : irq),
+                             NULL);
+                return; /* bail immediately */
             case 6: /* Set Priority                */
                 vpic->priority_add = (val + 1) & 7;
                 break;
@@ -499,7 +507,7 @@ void vpic_irq_negative_edge(struct domai
         vpic_update_int_output(vpic);
 }
 
-int cpu_get_pic_interrupt(struct vcpu *v, int *type)
+int cpu_get_pic_interrupt(struct vcpu *v)
 {
     int irq, vector;
     struct hvm_hw_vpic *vpic = &v->domain->arch.hvm_domain.vpic[0];
@@ -512,6 +520,5 @@ int cpu_get_pic_interrupt(struct vcpu *v
         return -1;
 
     vector = vpic[irq >> 3].irq_base + (irq & 7);
-    *type = APIC_DM_EXTINT;
     return vector;
 }
diff -Naurp xen/arch/x86/hvm/vpt.c xen-redhat/arch/x86/hvm/vpt.c
--- xen/arch/x86/hvm/vpt.c
+++ xen-redhat/arch/x86/hvm/vpt.c
@@ -15,7 +15,6 @@
  * You should have received a copy of the GNU General Public License along with
  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
  */
 
 #include <xen/time.h>
@@ -23,6 +22,66 @@
 #include <asm/hvm/vpt.h>
 #include <asm/event.h>
 
+#define mode_is(d, name) \
+    ((d)->arch.hvm_domain.params[HVM_PARAM_TIMER_MODE] == HVMPTM_##name)
+
+void hvm_init_guest_time(struct domain *d)
+{
+    struct pl_time *pl = &d->arch.hvm_domain.pl_time;
+
+    spin_lock_init(&pl->pl_time_lock);
+    pl->stime_offset = -(u64)get_s_time();
+    pl->last_guest_time = 0;
+}
+
+u64 hvm_get_guest_time(struct vcpu *v)
+{
+    struct pl_time *pl = &v->domain->arch.hvm_domain.pl_time;
+    u64 now;
+
+    /* Called from device models shared with PV guests. Be careful. */
+    ASSERT(is_hvm_vcpu(v));
+
+    spin_lock(&pl->pl_time_lock);
+    now = get_s_time() + pl->stime_offset;
+    if ( (int64_t)(now - pl->last_guest_time) > 0 )
+        pl->last_guest_time = now;
+    else
+        now = ++pl->last_guest_time;
+    spin_unlock(&pl->pl_time_lock);
+
+    return now + v->arch.hvm_vcpu.stime_offset;
+}
+
+void hvm_set_guest_time(struct vcpu *v, u64 guest_time)
+{
+    v->arch.hvm_vcpu.stime_offset += guest_time - hvm_get_guest_time(v);
+}
+
+static int pt_irq_vector(struct periodic_time *pt, enum hvm_intack src)
+{
+    struct vcpu *v = pt->vcpu;
+
+    if ( pt->source == PTSRC_lapic )
+        return pt->irq;
+
+    return get_isa_irq_vector(v, pt->irq, src);
+}
+
+static int pt_irq_masked(struct periodic_time *pt)
+{
+    struct vcpu *v = pt->vcpu;
+
+    if ( pt->source == PTSRC_lapic )
+    {
+        struct vlapic *vlapic = vcpu_vlapic(v);
+        return (!vlapic_enabled(vlapic) ||
+                (vlapic_get_reg(vlapic, APIC_LVTT) & APIC_LVT_MASKED));
+    }
+
+    return is_isa_irq_masked(v, pt->irq);
+}
+
 static void pt_lock(struct periodic_time *pt)
 {
     struct vcpu *v;
@@ -42,29 +101,46 @@ static void pt_unlock(struct periodic_ti
     spin_unlock(&pt->vcpu->arch.hvm_vcpu.tm_lock);
 }
 
-static void missed_ticks(struct periodic_time *pt)
+static void pt_process_missed_ticks(struct periodic_time *pt)
 {
-    s_time_t missed_ticks;
+    s_time_t missed_ticks, now = NOW();
 
-    missed_ticks = NOW() - pt->scheduled;
+    if ( pt->one_shot )
+        return;
+
+    missed_ticks = now - pt->scheduled;
     if ( missed_ticks <= 0 )
         return;
 
     missed_ticks = missed_ticks / (s_time_t) pt->period + 1;
-    if ( missed_ticks > 1000 )
-    {
-        /* TODO: Adjust guest time together */
-        pt->pending_intr_nr++;
-    }
+    if ( mode_is(pt->vcpu->domain, no_missed_ticks_pending) )
+        pt->do_not_freeze = !pt->pending_intr_nr;
     else
-    {
         pt->pending_intr_nr += missed_ticks;
-    }
-
     pt->scheduled += missed_ticks * pt->period;
 }
 
-void pt_freeze_time(struct vcpu *v)
+static void pt_freeze_time(struct vcpu *v)
+{
+    if ( !mode_is(v->domain, delay_for_missed_ticks) )
+        return;
+
+    v->arch.hvm_vcpu.guest_time = hvm_get_guest_time(v);
+}
+
+static void pt_thaw_time(struct vcpu *v)
+{
+    if ( !mode_is(v->domain, delay_for_missed_ticks) )
+        return;
+
+    if ( v->arch.hvm_vcpu.guest_time == 0 )
+         return;
+
+    hvm_set_guest_time(v, v->arch.hvm_vcpu.guest_time);
+    v->arch.hvm_vcpu.guest_time = 0;
+}
+
+void pt_save_timer(struct vcpu *v)
 {
     struct list_head *head = &v->arch.hvm_vcpu.tm_list;
     struct periodic_time *pt;
@@ -74,33 +150,30 @@ void pt_freeze_time(struct vcpu *v)
 
     spin_lock(&v->arch.hvm_vcpu.tm_lock);
 
-    v->arch.hvm_vcpu.guest_time = hvm_get_guest_time(v);
-
     list_for_each_entry ( pt, head, list )
-        stop_timer(&pt->timer);
+        if ( !pt->do_not_freeze )
+            stop_timer(&pt->timer);
+
+    pt_freeze_time(v);
 
     spin_unlock(&v->arch.hvm_vcpu.tm_lock);
 }
 
-void pt_thaw_time(struct vcpu *v)
+void pt_restore_timer(struct vcpu *v)
 {
     struct list_head *head = &v->arch.hvm_vcpu.tm_list;
     struct periodic_time *pt;
 
     spin_lock(&v->arch.hvm_vcpu.tm_lock);
 
-    if ( v->arch.hvm_vcpu.guest_time )
+    list_for_each_entry ( pt, head, list )
     {
-        hvm_set_guest_time(v, v->arch.hvm_vcpu.guest_time);
-        v->arch.hvm_vcpu.guest_time = 0;
-
-        list_for_each_entry ( pt, head, list )
-        {
-            missed_ticks(pt);
-            set_timer(&pt->timer, pt->scheduled);
-        }
+        pt_process_missed_ticks(pt);
+        set_timer(&pt->timer, pt->scheduled);
     }
 
+    pt_thaw_time(v);
+
     spin_unlock(&v->arch.hvm_vcpu.tm_lock);
 }
 
@@ -111,12 +184,14 @@ static void pt_timer_fn(void *data)
     pt_lock(pt);
 
     pt->pending_intr_nr++;
-    pt->scheduled += pt->period;
-
-    missed_ticks(pt);
+    pt->do_not_freeze = 0;
 
     if ( !pt->one_shot )
+    {
+        pt->scheduled += pt->period;
+        pt_process_missed_ticks(pt);
         set_timer(&pt->timer, pt->scheduled);
+    }
 
     vcpu_kick(pt->vcpu);
 
@@ -126,67 +201,62 @@ static void pt_timer_fn(void *data)
 void pt_update_irq(struct vcpu *v)
 {
     struct list_head *head = &v->arch.hvm_vcpu.tm_list;
-    struct periodic_time *pt;
+    struct periodic_time *pt, *earliest_pt = NULL;
     uint64_t max_lag = -1ULL;
-    int irq = -1;
+    int irq, is_lapic;
 
     spin_lock(&v->arch.hvm_vcpu.tm_lock);
 
     list_for_each_entry ( pt, head, list )
     {
-        if ( !is_isa_irq_masked(v, pt->irq) && pt->pending_intr_nr &&
+        if ( !pt_irq_masked(pt) && pt->pending_intr_nr &&
              ((pt->last_plt_gtime + pt->period_cycles) < max_lag) )
         {
             max_lag = pt->last_plt_gtime + pt->period_cycles;
-            irq = pt->irq;
+            earliest_pt = pt;
         }
     }
 
+    if ( earliest_pt == NULL )
+    {
+        spin_unlock(&v->arch.hvm_vcpu.tm_lock);
+        return;
+    }
+
+    earliest_pt->irq_issued = 1;
+    irq = earliest_pt->irq;
+    is_lapic = (earliest_pt->source == PTSRC_lapic);
+
     spin_unlock(&v->arch.hvm_vcpu.tm_lock);
 
-    if ( is_lvtt(v, irq) )
+    if ( is_lapic )
     {
         vlapic_set_irq(vcpu_vlapic(v), irq, 0);
     }
-    else if ( irq >= 0 )
+    else
     {
         hvm_isa_irq_deassert(v->domain, irq);
         hvm_isa_irq_assert(v->domain, irq);
     }
 }
 
-static struct periodic_time *is_pt_irq(struct vcpu *v, int vector, int type)
+static struct periodic_time *is_pt_irq(
+    struct vcpu *v, int vector, enum hvm_intack src)
 {
     struct list_head *head = &v->arch.hvm_vcpu.tm_list;
     struct periodic_time *pt;
-    struct RTCState *rtc = &v->domain->arch.hvm_domain.pl_time.vrtc;
-    int vec;
 
     list_for_each_entry ( pt, head, list )
     {
-        if ( !pt->pending_intr_nr )
-            continue;
-
-        if ( is_lvtt(v, pt->irq) )
-        {
-            if ( pt->irq != vector )
-                continue;
+        if ( pt->pending_intr_nr && pt->irq_issued &&
+             (vector == pt_irq_vector(pt, src)) )
             return pt;
-        }
-
-        vec = get_isa_irq_vector(v, pt->irq, type);
-
-        /* RTC irq need special care */
-        if ( (vector != vec) || (pt->irq == 8 && !is_rtc_periodic_irq(rtc)) )
-            continue;
-
-        return pt;
     }
 
     return NULL;
 }
 
-void pt_intr_post(struct vcpu *v, int vector, int type)
+void pt_intr_post(struct vcpu *v, int vector, enum hvm_intack src)
 {
     struct periodic_time *pt;
     time_cb *cb;
@@ -194,19 +264,38 @@ void pt_intr_post(struct vcpu *v, int ve
 
     spin_lock(&v->arch.hvm_vcpu.tm_lock);
 
-    pt = is_pt_irq(v, vector, type);
+    pt = is_pt_irq(v, vector, src);
     if ( pt == NULL )
     {
         spin_unlock(&v->arch.hvm_vcpu.tm_lock);
         return;
     }
 
-    ASSERT(pt->vcpu == v);
+    pt->irq_issued = 0;
 
-    pt->pending_intr_nr--;
-    pt->last_plt_gtime += pt->period_cycles;
+    if ( pt->one_shot )
+    {
+        if ( pt->on_list )
+            list_del(&pt->list);
+        pt->on_list = 0;
+    }
+    else
+    {
+        if ( mode_is(v->domain, one_missed_tick_pending) ||
+             mode_is(v->domain, no_missed_ticks_pending) )
+        {
+            pt->last_plt_gtime = hvm_get_guest_time(v);
+            pt->pending_intr_nr = 0; /* 'collapse' all missed ticks */
+        }
+        else
+        {
+            pt->last_plt_gtime += pt->period_cycles;
+            pt->pending_intr_nr--;
+        }
+    }
 
-    if ( hvm_get_guest_time(v) < pt->last_plt_gtime )
+    if ( mode_is(v->domain, delay_for_missed_ticks) &&
+         (hvm_get_guest_time(v) < pt->last_plt_gtime) )
         hvm_set_guest_time(v, pt->last_plt_gtime);
 
     cb = pt->cb;
@@ -253,30 +342,36 @@ void create_periodic_time(
     struct vcpu *v, struct periodic_time *pt, uint64_t period,
     uint8_t irq, char one_shot, time_cb *cb, void *data)
 {
+    ASSERT(pt->source != 0);
+
     destroy_periodic_time(pt);
 
     spin_lock(&v->arch.hvm_vcpu.tm_lock);
 
-    pt->enabled = 1;
     pt->pending_intr_nr = 0;
+    pt->do_not_freeze = 0;
+    pt->irq_issued = 0;
 
-    if ( period < 900000 ) /* < 0.9 ms */
+    /* Periodic timer must be at least 0.9ms. */
+    if ( (period < 900000) && !one_shot )
     {
         gdprintk(XENLOG_WARNING,
                  "HVM_PlatformTime: program too small period %"PRIu64"\n",
                  period);
-        period = 900000; /* force to 0.9ms */
+        period = 900000;
     }
+
     pt->period = period;
     pt->vcpu = v;
     pt->last_plt_gtime = hvm_get_guest_time(pt->vcpu);
     pt->irq = irq;
-    pt->period_cycles = (u64)period * cpu_khz / 1000000L;
+    pt->period_cycles = (u64)period;
     pt->one_shot = one_shot;
     pt->scheduled = NOW() + period;
     pt->cb = cb;
     pt->priv = data;
 
+    pt->on_list = 1;
     list_add(&pt->list, &v->arch.hvm_vcpu.tm_list);
 
     init_timer(&pt->timer, pt_timer_fn, pt, v->processor);
@@ -287,12 +382,14 @@ void create_periodic_time(
 
 void destroy_periodic_time(struct periodic_time *pt)
 {
-    if ( !pt->enabled )
+    /* Was this structure previously initialised by create_periodic_time()? */
+    if ( pt->vcpu == NULL )
         return;
 
     pt_lock(pt);
-    pt->enabled = 0;
-    list_del(&pt->list);
+    if ( pt->on_list )
+        list_del(&pt->list);
+    pt->on_list = 0;
     pt_unlock(pt);
 
     /*
@@ -301,3 +398,53 @@ void destroy_periodic_time(struct period
      */
     kill_timer(&pt->timer);
 }
+
+static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v)
+{
+    int on_list;
+
+    ASSERT(pt->source == PTSRC_isa);
+
+    if ( pt->vcpu == NULL )
+        return;
+
+    pt_lock(pt);
+    on_list = pt->on_list;
+    if ( pt->on_list )
+        list_del(&pt->list);
+    pt->on_list = 0;
+    pt_unlock(pt);
+
+    spin_lock(&v->arch.hvm_vcpu.tm_lock);
+    pt->vcpu = v;
+    if ( on_list )
+    {
+        pt->on_list = 1;
+        list_add(&pt->list, &v->arch.hvm_vcpu.tm_list);
+
+        migrate_timer(&pt->timer, v->processor);
+    }
+    spin_unlock(&v->arch.hvm_vcpu.tm_lock);
+}
+
+void pt_adjust_global_vcpu_target(struct vcpu *v)
+{
+    struct pl_time *pl_time = &v->domain->arch.hvm_domain.pl_time;
+
+    if ( v == NULL )
+        return;
+
+    ASSERT(v == v->domain->arch.hvm_domain.i8259_target);
+    
+    spin_lock(&pl_time->vpit.lock);
+    pt_adjust_vcpu(&pl_time->vpit.pt[0], v);
+    spin_unlock(&pl_time->vpit.lock);
+
+    spin_lock(&pl_time->vrtc.lock);
+    rtc_migrate_timers(v);
+    spin_unlock(&pl_time->vrtc.lock);
+
+    spin_lock(&pl_time->vhpet.lock);
+    hpet_migrate_timers(v);
+    spin_unlock(&pl_time->vhpet.lock);
+}
diff -Naurp xen/arch/x86/i8259.c xen-redhat/arch/x86/i8259.c
--- xen/arch/x86/i8259.c
+++ xen-redhat/arch/x86/i8259.c
@@ -395,6 +395,8 @@ void __init init_IRQ(void)
         irq_desc[i].handler = &no_irq_type;
         irq_desc[i].action  = NULL;
         irq_desc[i].depth   = 1;
+        irq_desc[i].vector = i;
+        INIT_LIST_HEAD(&irq_desc[i].rl_link);
         spin_lock_init(&irq_desc[i].lock);
         set_intr_gate(i, interrupt[i]);
     }
@@ -405,6 +407,10 @@ void __init init_IRQ(void)
         irq_desc[LEGACY_VECTOR(i)].handler = &i8259A_irq_type;
     }
 
+    /* Never allocate the hypercall vector or Linux/BSD fast-trap vector. */
+    vector_irq[HYPERCALL_VECTOR] = NEVER_ASSIGN;
+    vector_irq[0x80] = NEVER_ASSIGN;
+
     apic_intr_init();
 
     /* Set the clock to HZ Hz */
diff -Naurp xen/arch/x86/io_apic.c xen-redhat/arch/x86/io_apic.c
--- xen/arch/x86/io_apic.c
+++ xen-redhat/arch/x86/io_apic.c
@@ -27,16 +27,17 @@
 #include <xen/delay.h>
 #include <xen/sched.h>
 #include <xen/acpi.h>
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
 #include <xen/keyhandler.h>
 #include <asm/io.h>
 #include <asm/mc146818rtc.h>
 #include <asm/smp.h>
 #include <asm/desc.h>
+#include <asm/msi.h>
 #include <mach_apic.h>
 #include <io_ports.h>
-
-#define set_irq_info(irq, mask) ((void)0)
-#define set_native_irq_info(irq, mask) ((void)0)
+#include <public/physdev.h>
 
 /* Different to Linux: our implementation can be simpler. */
 #define make_8259A_irq(irq) (io_apic_irqs &= ~(1<<(irq)))
@@ -83,10 +84,13 @@ int disable_timer_pin_1 __initdata;
 
 static struct irq_pin_list {
     int apic, pin, next;
-} irq_2_pin[PIN_MAP_SIZE];
+} irq_2_pin[PIN_MAP_SIZE] = {
+    [0 ... PIN_MAP_SIZE-1].pin = -1
+};
 static int irq_2_pin_free_entry = NR_IRQS;
 
-int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
+int vector_irq[NR_VECTORS] __read_mostly = {
+    [0 ... NR_VECTORS - 1] = FREE_TO_ASSIGN};
 
 /*
  * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
@@ -229,27 +233,32 @@ static void unmask_IO_APIC_irq (unsigned
     spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
-{
+#define clear_IO_APIC_pin(a,p)     __clear_IO_APIC_pin(a,p,0)
+#define clear_IO_APIC_pin_raw(a,p) __clear_IO_APIC_pin(a,p,1)
+static void __clear_IO_APIC_pin(unsigned int apic, unsigned int pin, int raw)
+{
+    unsigned int (*read)(unsigned int, unsigned int)
+        = raw ? __io_apic_read : io_apic_read;
+    void (*write)(unsigned int, unsigned int, unsigned int)
+        = raw ? __io_apic_write : io_apic_write;
     struct IO_APIC_route_entry entry;
     unsigned long flags;
-	
+    
     /* Check delivery_mode to be sure we're not clearing an SMI pin */
     spin_lock_irqsave(&ioapic_lock, flags);
-    *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
-    *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
+    *(((int*)&entry) + 0) = (*read)(apic, 0x10 + 2 * pin);
+    *(((int*)&entry) + 1) = (*read)(apic, 0x11 + 2 * pin);
     spin_unlock_irqrestore(&ioapic_lock, flags);
     if (entry.delivery_mode == dest_SMI)
         return;
-
     /*
      * Disable it in the IO-APIC irq-routing table:
      */
     memset(&entry, 0, sizeof(entry));
     entry.mask = 1;
     spin_lock_irqsave(&ioapic_lock, flags);
-    io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
-    io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
+    (*write)(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
+    (*write)(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
     spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
@@ -257,9 +266,12 @@ static void clear_IO_APIC (void)
 {
     int apic, pin;
 
-    for (apic = 0; apic < nr_ioapics; apic++)
-        for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+    for (apic = 0; apic < nr_ioapics; apic++) {
+        for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
             clear_IO_APIC_pin(apic, pin);
+            clear_IO_APIC_pin_raw(apic, pin);
+        }
+    }
 }
 
 #ifdef CONFIG_SMP
@@ -663,42 +675,53 @@ static inline int IO_APIC_irq_trigger(in
 }
 
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
+u8 irq_vector[NR_IRQS] __read_mostly;
+
+int free_irq_vector(int vector)
+{
+    int irq;
+
+    BUG_ON((vector > LAST_DYNAMIC_VECTOR) || (vector < FIRST_DYNAMIC_VECTOR));
+
+    spin_lock(&vector_lock);
+    if ((irq = vector_irq[vector]) == AUTO_ASSIGN)
+        vector_irq[vector] = FREE_TO_ASSIGN;
+    spin_unlock(&vector_lock);
+
+    return (irq == AUTO_ASSIGN) ? 0 : -EINVAL;
+}
 
 int assign_irq_vector(int irq)
 {
-    static unsigned current_vector = FIRST_DYNAMIC_VECTOR, offset = 0;
+    static unsigned current_vector = FIRST_DYNAMIC_VECTOR;
     unsigned vector;
 
-    BUG_ON(irq >= NR_IRQ_VECTORS);
+    BUG_ON(irq >= NR_IRQS);
+
     spin_lock(&vector_lock);
 
-    if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
+    if ((irq != AUTO_ASSIGN) && (irq_to_vector(irq) > 0)) {
+        spin_unlock(&vector_lock);
+        return irq_to_vector(irq);
+    }
+    if ((irq != AUTO_ASSIGN) && (IO_APIC_VECTOR(irq) > 0)) {
         spin_unlock(&vector_lock);
         return IO_APIC_VECTOR(irq);
     }
 
-next:
-    current_vector += 8;
+    vector = current_vector;
+    while (vector_irq[vector] != FREE_TO_ASSIGN) {
+        vector += 8;
+        if (vector > LAST_DYNAMIC_VECTOR)
+            vector = FIRST_DYNAMIC_VECTOR + ((vector + 1) & 7);
 
-    /* Skip the hypercall vector. */
-    if (current_vector == HYPERCALL_VECTOR)
-        goto next;
-
-    /* Skip the Linux/BSD fast-trap vector. */
-    if (current_vector == 0x80)
-        goto next;
-
-    if (current_vector > LAST_DYNAMIC_VECTOR) {
-        offset++;
-        if (!(offset%8)) {
+        if (vector == current_vector) {
             spin_unlock(&vector_lock);
             return -ENOSPC;
         }
-        current_vector = FIRST_DYNAMIC_VECTOR + offset;
     }
 
-    vector = current_vector;
+    current_vector = vector;
     vector_irq[vector] = irq;
     if (irq != AUTO_ASSIGN)
         IO_APIC_VECTOR(irq) = vector;
@@ -708,8 +731,8 @@ next:
     return vector;
 }
 
-static struct hw_interrupt_type ioapic_level_type;
-static struct hw_interrupt_type ioapic_edge_type;
+struct hw_interrupt_type ioapic_level_type;
+struct hw_interrupt_type ioapic_edge_type;
 
 #define IOAPIC_AUTO	-1
 #define IOAPIC_EDGE	0
@@ -1009,11 +1032,6 @@ static void __init enable_IO_APIC(void)
     int i, apic;
     unsigned long flags;
 
-    for (i = 0; i < PIN_MAP_SIZE; i++) {
-        irq_2_pin[i].pin = -1;
-        irq_2_pin[i].next = 0;
-    }
-
     /* Initialise dynamic irq_2_pin free list. */
     for (i = NR_IRQS; i < PIN_MAP_SIZE; i++)
         irq_2_pin[i].next = i + 1;
@@ -1104,6 +1122,7 @@ void disable_IO_APIC(void)
         entry.delivery_mode   = dest_ExtINT; /* ExtInt */
         entry.vector          = 0;
         entry.dest.physical.physical_dest =
+            /* TODO: BP: should be get_apic_id */
             GET_APIC_ID(apic_read(APIC_ID));
 
         /*
@@ -1510,7 +1529,7 @@ static void end_edge_ioapic_vector(unsig
  * edge-triggered handler, without risking IRQ storms and other ugly
  * races.
  */
-static struct hw_interrupt_type ioapic_edge_type = {
+struct hw_interrupt_type ioapic_edge_type = {
     .typename 	= "IO-APIC-edge",
     .startup 	= startup_edge_ioapic_vector,
     .shutdown 	= disable_edge_ioapic_vector,
@@ -1521,7 +1540,7 @@ static struct hw_interrupt_type ioapic_e
     .set_affinity 	= set_ioapic_affinity_vector,
 };
 
-static struct hw_interrupt_type ioapic_level_type = {
+struct hw_interrupt_type ioapic_level_type = {
     .typename 	= "IO-APIC-level",
     .startup 	= startup_level_ioapic_vector,
     .shutdown 	= mask_IO_APIC_vector,
@@ -1532,6 +1551,50 @@ static struct hw_interrupt_type ioapic_l
     .set_affinity 	= set_ioapic_affinity_vector,
 };
 
+static unsigned int startup_msi_vector(unsigned int vector)
+{
+    unmask_msi_vector(vector);
+    return 0;
+}
+
+static void ack_msi_vector(unsigned int vector)
+{
+    if ( msi_maskable_irq(irq_desc[vector].msi_desc) )
+        ack_APIC_irq(); /* ACKTYPE_NONE */
+}
+
+static void end_msi_vector(unsigned int vector)
+{
+    if ( !msi_maskable_irq(irq_desc[vector].msi_desc) )
+        ack_APIC_irq(); /* ACKTYPE_EOI */
+}
+
+static void shutdown_msi_vector(unsigned int vector)
+{
+    mask_msi_vector(vector);
+}
+
+static void set_msi_affinity_vector(unsigned int vector, cpumask_t cpu_mask)
+{
+    set_native_irq_info(vector, cpu_mask);
+    set_msi_affinity(vector, cpu_mask);
+}
+
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+struct hw_interrupt_type pci_msi_type = {
+    .typename   = "PCI-MSI",
+    .startup    = startup_msi_vector,
+    .shutdown   = shutdown_msi_vector,
+    .enable	    = unmask_msi_vector,
+    .disable    = mask_msi_vector,
+    .ack        = ack_msi_vector,
+    .end        = end_msi_vector,
+    .set_affinity   = set_msi_affinity_vector,
+};
+
 static inline void init_IO_APIC_traps(void)
 {
     int irq;
@@ -1649,6 +1712,9 @@ static inline void check_timer(void)
 {
     int apic1, pin1, apic2, pin2;
     int vector;
+    unsigned long flags;
+
+    local_irq_save(flags);
 
     /*
      * get/set the timer IRQ vector:
@@ -1690,6 +1756,7 @@ static inline void check_timer(void)
          */
         unmask_IO_APIC_irq(0);
         if (timer_irq_works()) {
+            local_irq_restore(flags);
             if (disable_timer_pin_1 > 0)
                 clear_IO_APIC_pin(apic1, pin1);
             return;
@@ -1707,6 +1774,7 @@ static inline void check_timer(void)
          */
         setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
         if (timer_irq_works()) {
+            local_irq_restore(flags);
             printk("works.\n");
             if (pin1 != -1)
                 replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
@@ -1734,6 +1802,7 @@ static inline void check_timer(void)
     enable_8259A_irq(0);
 
     if (timer_irq_works()) {
+        local_irq_restore(flags);
         printk(" works.\n");
         return;
     }
@@ -1749,6 +1818,8 @@ static inline void check_timer(void)
 
     unlock_ExtINT_logic();
 
+    local_irq_restore(flags);
+
     if (timer_irq_works()) {
         printk(" works.\n");
         return;
@@ -2128,7 +2199,7 @@ int ioapic_guest_write(unsigned long phy
     if ( new_rte.vector >= FIRST_DYNAMIC_VECTOR )
         new_irq = vector_irq[new_rte.vector];
 
-    if ( (old_irq != new_irq) && (old_irq != -1) && IO_APIC_IRQ(old_irq) )
+    if ( (old_irq != new_irq) && (old_irq >= 0) && IO_APIC_IRQ(old_irq) )
     {
         if ( irq_desc[IO_APIC_VECTOR(old_irq)].action )
         {
@@ -2140,7 +2211,7 @@ int ioapic_guest_write(unsigned long phy
         remove_pin_at_irq(old_irq, apic, pin);
     }
 
-    if ( (new_irq != -1) && IO_APIC_IRQ(new_irq) )
+    if ( (new_irq >= 0) && IO_APIC_IRQ(new_irq) )
     {
         if ( irq_desc[IO_APIC_VECTOR(new_irq)].action )
         {
diff -Naurp xen/arch/x86/ioport_emulate.c xen-redhat/arch/x86/ioport_emulate.c
--- xen/arch/x86/ioport_emulate.c
+++ xen-redhat/arch/x86/ioport_emulate.c
@@ -0,0 +1,141 @@
+/******************************************************************************
+ * ioport_emulate.c
+ * 
+ * Handle I/O port access quirks of various platforms.
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/sched.h>
+#include <xen/dmi.h>
+
+/* Function pointer used to handle platform specific I/O port emulation. */
+extern void (*ioemul_handle_quirk)(
+    u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
+
+static void ioemul_handle_proliant_quirk(
+    u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs)
+{
+    uint16_t port = regs->edx;
+    uint8_t value = regs->eax;
+
+    if ( (opcode != 0xee) || (port != 0xcd4) || !(value & 0x80) )
+        return;
+
+    /*    pushfw */
+    io_emul_stub[ 0] = 0x66;
+    io_emul_stub[ 1] = 0x9c;
+    /*    cli */
+    io_emul_stub[ 2] = 0xfa;
+    /*    out %al,%dx */
+    io_emul_stub[ 3] = 0xee;
+    /* 1: in %dx,%al */
+    io_emul_stub[ 4] = 0xec;
+    /*    test $0x80,%al */
+    io_emul_stub[ 5] = 0xa8;
+    io_emul_stub[ 6] = 0x80;
+    /*    jnz 1b */
+    io_emul_stub[ 7] = 0x75;
+    io_emul_stub[ 8] = 0xfb;
+    /*    popfw */
+    io_emul_stub[ 9] = 0x66;
+    io_emul_stub[10] = 0x9d;
+    /*    ret */
+    io_emul_stub[11] = 0xc3;
+}
+
+int __init proliant_quirk(struct dmi_system_id *d)
+{
+    ioemul_handle_quirk = ioemul_handle_proliant_quirk;
+    return 0;
+}
+
+/* This table is the set of system-specific I/O emulation hooks. */
+static struct dmi_system_id __initdata ioport_quirks_tbl[] = {
+    /*
+     * I/O emulation hook for certain HP ProLiant servers with
+     * 'special' SMM goodness.
+     */
+    {
+        .callback = proliant_quirk,
+        .ident = "HP ProLiant DL3xx",
+        .matches = {
+            DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
+            DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL3"),
+        },
+    },
+    {
+        .callback = proliant_quirk,
+        .ident = "HP ProLiant DL5xx",
+        .matches = {
+            DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
+            DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL5"),
+        },
+    },
+    {
+        .callback = proliant_quirk,
+        .ident = "HP ProLiant DL7xx",
+        .matches = {
+            DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
+            DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL7"),
+        },
+    },
+    {
+        .callback = proliant_quirk,
+        .ident = "HP ProLiant ML3xx",
+        .matches = {
+            DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
+            DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant ML3"),
+        },
+    },
+    {
+        .callback = proliant_quirk,
+        .ident = "HP ProLiant ML5xx",
+        .matches = {
+            DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
+            DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant ML5"),
+        },
+    },
+    {
+        .callback = proliant_quirk,
+        .ident = "HP ProLiant BL2xx",
+        .matches = {
+            DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
+            DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL2"),
+        },
+    },
+    {
+        .callback = proliant_quirk,
+        .ident = "HP ProLiant BL4xx",
+        .matches = {
+            DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
+            DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL4"),
+        },
+    },
+    {
+        .callback = proliant_quirk,
+        .ident = "HP ProLiant BL6xx",
+        .matches = {
+            DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
+            DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL6"),
+        },
+    },
+    { }
+};
+
+int __init ioport_quirks_init(void)
+{
+    dmi_check_system(ioport_quirks_tbl);
+    return 0;
+}
+__initcall(ioport_quirks_init);
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -Naurp xen/arch/x86/irq.c xen-redhat/arch/x86/irq.c
--- xen/arch/x86/irq.c
+++ xen-redhat/arch/x86/irq.c
@@ -14,8 +14,10 @@
 #include <xen/sched.h>
 #include <xen/keyhandler.h>
 #include <xen/compat.h>
+#include <xen/iocap.h>
+#include <asm/msi.h>
 #include <asm/current.h>
-#include <asm/smpboot.h>
+#include <public/physdev.h>
 
 /* opt_noirqbalance: If true, software IRQ balancing/affinity is disabled. */
 int opt_noirqbalance = 0;
@@ -23,6 +25,14 @@ boolean_param("noirqbalance", opt_noirqb
 
 irq_desc_t irq_desc[NR_IRQS];
 
+static LIST_HEAD(irq_ratelimit_list);
+static DEFINE_SPINLOCK(irq_ratelimit_lock);
+static struct timer irq_ratelimit_timer;
+
+/* irq_ratelimit: max number of IRQs allowed per 10ms window; set to 0 to disable. */
+unsigned int __read_mostly irq_ratelimit_threshold = 10000;
+integer_param("irq_ratelimit", irq_ratelimit_threshold);
+
 static void __do_IRQ_guest(int vector);
 
 void no_action(int cpl, void *dev_id, struct cpu_user_regs *regs) { }
@@ -98,6 +108,66 @@ asmlinkage void do_IRQ(struct cpu_user_r
     spin_unlock(&desc->lock);
 }
 
+static void irq_ratelimit_timer_fn(void *data)
+{
+    irq_desc_t *desc, *tmp;
+    unsigned long flags;
+
+    spin_lock_irqsave(&irq_ratelimit_lock, flags);
+
+    list_for_each_entry_safe ( desc, tmp, &irq_ratelimit_list, rl_link )
+    {
+        spin_lock(&desc->lock);
+        desc->handler->enable(desc->vector);
+        list_del(&desc->rl_link);
+        INIT_LIST_HEAD(&desc->rl_link);
+        spin_unlock(&desc->lock);
+    }
+
+    spin_unlock_irqrestore(&irq_ratelimit_lock, flags);
+}
+
+static int __init irq_ratelimit_init(void)
+{
+    if ( irq_ratelimit_threshold )
+        init_timer(&irq_ratelimit_timer, irq_ratelimit_timer_fn, NULL, 0);
+    return 0;
+}
+__initcall(irq_ratelimit_init);
+
+int request_irq(unsigned int irq,
+        void (*handler)(int, void *, struct cpu_user_regs *),
+        unsigned long irqflags, const char * devname, void *dev_id)
+{
+    struct irqaction * action;
+    int retval;
+
+    /*
+     * Sanity-check: shared interrupts must pass in a real dev-ID,
+     * otherwise we'll have trouble later trying to figure out
+     * which interrupt is which (messes up the interrupt freeing
+     * logic etc).
+     */
+    if (irq >= NR_IRQS)
+        return -EINVAL;
+    if (!handler)
+        return -EINVAL;
+
+    action = xmalloc(struct irqaction);
+    if (!action)
+        return -ENOMEM;
+
+    action->handler = handler;
+    action->name = devname;
+    action->dev_id = dev_id;
+
+    retval = setup_irq(irq, action);
+    if (retval)
+        xfree(action);
+
+    return retval;
+}
+ 
 void free_irq(unsigned int irq)
 {
     unsigned int  vector = irq_to_vector(irq);
@@ -168,9 +238,10 @@ struct pending_eoi {
 static DEFINE_PER_CPU(struct pending_eoi, pending_eoi[NR_VECTORS]);
 #define pending_eoi_sp(p) ((p)[NR_VECTORS-1].vector)
 
+extern struct hw_interrupt_type ioapic_level_type;
+
 static void __do_IRQ_guest(int vector)
 {
-    unsigned int        irq = vector_to_irq(vector);
     irq_desc_t         *desc = &irq_desc[vector];
     irq_guest_action_t *action = (irq_guest_action_t *)desc->action;
     struct domain      *d;
@@ -186,6 +257,37 @@ static void __do_IRQ_guest(int vector)
         return;
     }
 
+    if ( action->nr_guests == 1 && action->guest[0]->domain_id != 0 &&
+         desc->handler != &ioapic_level_type )
+    {
+        if ( irq_ratelimit_timer.function && /* irq rate limiting enabled? */
+             unlikely(desc->rl_cnt++ >= irq_ratelimit_threshold) )
+        {
+            s_time_t now = NOW();
+            if ( now < (desc->rl_quantum_start + MILLISECS(10)) )
+            {
+                desc->handler->disable(vector);
+                /*
+                 * If handler->disable doesn't actually mask the interrupt, a
+                 * disabled irq can still fire. This check also avoids possible
+                 * deadlocks if irq_ratelimit_timer_fn runs at the same time.
+                 */
+                if ( likely(list_empty(&desc->rl_link)) )
+                {
+                    spin_lock(&irq_ratelimit_lock);
+                    if ( list_empty(&irq_ratelimit_list) )
+                        set_timer(&irq_ratelimit_timer, now + MILLISECS(10));
+                    list_add(&desc->rl_link, &irq_ratelimit_list);
+                    spin_unlock(&irq_ratelimit_lock);
+                }
+                desc->handler->end(vector);
+                return;
+            }
+            desc->rl_cnt = 0;
+            desc->rl_quantum_start = now;
+        }
+    }
+
     if ( action->ack_type == ACKTYPE_EOI )
     {
         sp = pending_eoi_sp(peoi);
@@ -199,12 +301,44 @@ static void __do_IRQ_guest(int vector)
 
     for ( i = 0; i < action->nr_guests; i++ )
     {
+        unsigned int irq;
         d = action->guest[i];
+        irq = domain_vector_to_irq(d, vector);
         if ( (action->ack_type != ACKTYPE_NONE) &&
              !test_and_set_bit(irq, d->pirq_mask) )
             action->in_flight++;
-        send_guest_pirq(d, irq);
+        if ( !hvm_do_IRQ_dpci(d, irq) )
+            send_guest_pirq(d, irq);
+    }
+}
+
+/*
+ * Retrieve Xen irq-descriptor corresponding to a domain-specific irq.
+ * The descriptor is returned locked. This function is safe against changes
+ * to the per-domain irq-to-vector mapping.
+ */
+irq_desc_t *domain_spin_lock_irq_desc(
+    struct domain *d, int irq, unsigned long *pflags)
+{
+    unsigned int vector;
+    unsigned long flags;
+    irq_desc_t *desc;
+
+    for ( ; ; )
+    {
+        vector = domain_irq_to_vector(d, irq);
+        if ( vector <= 0 )
+            return NULL;
+        desc = &irq_desc[vector];
+        spin_lock_irqsave(&desc->lock, flags);
+        if ( vector == domain_irq_to_vector(d, irq) )
+            break;
+        spin_unlock_irqrestore(&desc->lock, flags);
     }
+
+    if ( pflags != NULL )
+        *pflags = flags;
+    return desc;
 }
 
 /* Flush all ready EOIs from the top of this CPU's pending-EOI stack. */
@@ -270,11 +404,15 @@ static void __pirq_guest_eoi(struct doma
     irq_desc_t         *desc;
     irq_guest_action_t *action;
     cpumask_t           cpu_eoi_map;
+    int                 vector;
 
-    desc   = &irq_desc[irq_to_vector(irq)];
-    action = (irq_guest_action_t *)desc->action;
+    ASSERT(local_irq_is_enabled());
+    desc = domain_spin_lock_irq_desc(d, irq, NULL);
+    if ( desc == NULL )
+        return;
 
-    spin_lock_irq(&desc->lock);
+    action = (irq_guest_action_t *)desc->action;
+    vector = desc - irq_desc;
 
     ASSERT(!test_bit(irq, d->pirq_mask) ||
            (action->ack_type != ACKTYPE_NONE));
@@ -289,7 +427,7 @@ static void __pirq_guest_eoi(struct doma
     if ( action->ack_type == ACKTYPE_UNMASK )
     {
         ASSERT(cpus_empty(action->cpu_eoi_map));
-        desc->handler->end(irq_to_vector(irq));
+        desc->handler->end(vector);
         spin_unlock_irq(&desc->lock);
         return;
     }
@@ -341,13 +479,13 @@ int pirq_guest_unmask(struct domain *d)
 }
 
 extern int ioapic_ack_new;
-int pirq_acktype(int irq)
+int pirq_acktype(struct domain *d, int irq)
 {
     irq_desc_t  *desc;
     unsigned int vector;
 
-    vector = irq_to_vector(irq);
-    if ( vector == 0 )
+    vector = domain_irq_to_vector(d, irq);
+    if ( vector <= 0 )
         return ACKTYPE_NONE;
 
     desc = &irq_desc[vector];
@@ -364,6 +502,13 @@ int pirq_acktype(int irq)
         return ACKTYPE_NONE;
 
     /*
+     * MSIs are treated as edge-triggered interrupts, except
+     * when there is no proper way to mask them.
+     */
+    if ( desc->handler == &pci_msi_type )
+        return msi_maskable_irq(desc->msi_desc) ? ACKTYPE_NONE : ACKTYPE_EOI;
+
+    /*
      * Level-triggered IO-APIC interrupts need to be acknowledged on the CPU
      * on which they were received. This is because we tickle the LAPIC to EOI.
      */
@@ -387,23 +532,20 @@ int pirq_acktype(int irq)
     return 0;
 }
 
-int pirq_shared(int irq)
+int pirq_shared(struct domain *d, int irq)
 {
-    unsigned int        vector;
     irq_desc_t         *desc;
     irq_guest_action_t *action;
     unsigned long       flags;
     int                 shared;
 
-    vector = irq_to_vector(irq);
-    if ( vector == 0 )
+    desc = domain_spin_lock_irq_desc(d, irq, &flags);
+    if ( desc == NULL )
         return 0;
 
-    desc = &irq_desc[vector];
-
-    spin_lock_irqsave(&desc->lock, flags);
     action = (irq_guest_action_t *)desc->action;
     shared = ((desc->status & IRQ_GUEST) && (action->nr_guests > 1));
+
     spin_unlock_irqrestore(&desc->lock, flags);
 
     return shared;
@@ -413,21 +555,23 @@ int pirq_guest_bind(struct vcpu *v, int 
 {
     unsigned int        vector;
     irq_desc_t         *desc;
-    irq_guest_action_t *action;
-    unsigned long       flags;
+    irq_guest_action_t *action, *newaction = NULL;
     int                 rc = 0;
     cpumask_t           cpumask = CPU_MASK_NONE;
 
- retry:
-    vector = irq_to_vector(irq);
-    if ( vector == 0 )
-        return -EINVAL;
-
-    desc = &irq_desc[vector];
+    WARN_ON(!spin_is_locked(&v->domain->event_lock));
+    BUG_ON(!local_irq_is_enabled());
 
-    spin_lock_irqsave(&desc->lock, flags);
+ retry:
+    desc = domain_spin_lock_irq_desc(v->domain, irq, NULL);
+    if ( desc == NULL )
+    {
+        rc = -EINVAL;
+        goto out;
+    }
 
     action = (irq_guest_action_t *)desc->action;
+    vector = desc - irq_desc;
 
     if ( !(desc->status & IRQ_GUEST) )
     {
@@ -437,23 +581,29 @@ int pirq_guest_bind(struct vcpu *v, int 
                     "Cannot bind IRQ %d to guest. In use by '%s'.\n",
                     irq, desc->action->name);
             rc = -EBUSY;
-            goto out;
+            goto unlock_out;
         }
 
-        action = xmalloc(irq_guest_action_t);
-        if ( (desc->action = (struct irqaction *)action) == NULL )
+        if ( newaction == NULL )
         {
+            spin_unlock_irq(&desc->lock);
+            if ( (newaction = xmalloc(irq_guest_action_t)) != NULL )
+                goto retry;
             gdprintk(XENLOG_INFO,
-                    "Cannot bind IRQ %d to guest. Out of memory.\n",
-                    irq);
+                     "Cannot bind IRQ %d to guest. Out of memory.\n",
+                     irq);
             rc = -ENOMEM;
             goto out;
         }
 
+        action = newaction;
+        desc->action = (struct irqaction *)action;
+        newaction = NULL;
+
         action->nr_guests   = 0;
         action->in_flight   = 0;
         action->shareable   = will_share;
-        action->ack_type    = pirq_acktype(irq);
+        action->ack_type    = pirq_acktype(v->domain, irq);
         cpus_clear(action->cpu_eoi_map);
 
         desc->depth = 0;
@@ -468,11 +618,13 @@ int pirq_guest_bind(struct vcpu *v, int 
     }
     else if ( !will_share || !action->shareable )
     {
-        gdprintk(XENLOG_INFO, "Cannot bind IRQ %d to guest. "
-               "Will not share with others.\n",
-                irq);
+        gdprintk(XENLOG_INFO, "Cannot bind IRQ %d to guest. %s.\n",
+                 irq,
+                 will_share ?
+                 "Others do not share" :
+                 "Will not share with others");
         rc = -EBUSY;
-        goto out;
+        goto unlock_out;
     }
     else if ( action->nr_guests == 0 )
     {
@@ -482,7 +634,7 @@ int pirq_guest_bind(struct vcpu *v, int 
          */
         ASSERT(action->ack_type == ACKTYPE_EOI);
         ASSERT(desc->status & IRQ_DISABLED);
-        spin_unlock_irqrestore(&desc->lock, flags);
+        spin_unlock_irq(&desc->lock);
         cpu_relax();
         goto retry;
     }
@@ -492,35 +644,37 @@ int pirq_guest_bind(struct vcpu *v, int 
         gdprintk(XENLOG_INFO, "Cannot bind IRQ %d to guest. "
                "Already at max share.\n", irq);
         rc = -EBUSY;
-        goto out;
+        goto unlock_out;
     }
 
     action->guest[action->nr_guests++] = v->domain;
 
+ unlock_out:
+    spin_unlock_irq(&desc->lock);
  out:
-    spin_unlock_irqrestore(&desc->lock, flags);
+    if ( newaction != NULL )
+        xfree(newaction);
     return rc;
 }
 
-int pirq_guest_unbind(struct domain *d, int irq)
+static irq_guest_action_t *__pirq_guest_unbind(
+    struct domain *d, int irq, irq_desc_t *desc)
 {
-    unsigned int        vector = irq_to_vector(irq);
-    irq_desc_t         *desc = &irq_desc[vector];
+    unsigned int        vector;
     irq_guest_action_t *action;
     cpumask_t           cpu_eoi_map;
-    unsigned long       flags;
     int                 i;
 
-    BUG_ON(vector == 0);
-
-    spin_lock_irqsave(&desc->lock, flags);
+    BUG_ON(!(desc->status & IRQ_GUEST));
 
     action = (irq_guest_action_t *)desc->action;
+    vector = desc - irq_desc;
 
-    i = 0;
-    while ( action->guest[i] && (action->guest[i] != d) )
-        i++;
-    memmove(&action->guest[i], &action->guest[i+1], IRQ_MAX_GUESTS-i-1);
+    for ( i = 0; (i < action->nr_guests) && (action->guest[i] != d); i++ )
+        continue;
+    BUG_ON(i == action->nr_guests);
+    memmove(&action->guest[i], &action->guest[i+1],
+            (action->nr_guests-i-1) * sizeof(action->guest[0]));
     action->nr_guests--;
 
     switch ( action->ack_type )
@@ -537,9 +691,9 @@ int pirq_guest_unbind(struct domain *d, 
              (action->nr_guests != 0) )
         {
             cpu_eoi_map = action->cpu_eoi_map;
-            spin_unlock_irqrestore(&desc->lock, flags);    
+            spin_unlock_irq(&desc->lock);
             on_selected_cpus(cpu_eoi_map, set_eoi_ready, desc, 1, 0);
-            spin_lock_irqsave(&desc->lock, flags);
+            spin_lock_irq(&desc->lock);
         }
         break;
     }
@@ -551,7 +705,7 @@ int pirq_guest_unbind(struct domain *d, 
     BUG_ON(test_bit(irq, d->pirq_mask));
 
     if ( action->nr_guests != 0 )
-        goto out;
+        return NULL;
 
     BUG_ON(action->in_flight != 0);
 
@@ -571,21 +725,274 @@ int pirq_guest_unbind(struct domain *d, 
     if ( !cpus_empty(cpu_eoi_map) )
     {
         BUG_ON(action->ack_type != ACKTYPE_EOI);
-        spin_unlock_irqrestore(&desc->lock, flags);
+        spin_unlock_irq(&desc->lock);
         on_selected_cpus(cpu_eoi_map, set_eoi_ready, desc, 1, 1);
-        spin_lock_irqsave(&desc->lock, flags);
+        spin_lock_irq(&desc->lock);
     }
 
     BUG_ON(!cpus_empty(action->cpu_eoi_map));
 
     desc->action = NULL;
-    xfree(action);
     desc->status &= ~IRQ_GUEST;
+    desc->status &= ~IRQ_INPROGRESS;
     desc->handler->shutdown(vector);
 
+    /* Caller frees the old guest descriptor block. */
+    return action;
+}
+
+void pirq_guest_unbind(struct domain *d, int irq)
+{
+    irq_guest_action_t *oldaction = NULL;
+    irq_desc_t *desc;
+    int vector;
+
+    WARN_ON(!spin_is_locked(&d->event_lock));
+
+    BUG_ON(!local_irq_is_enabled());
+    desc = domain_spin_lock_irq_desc(d, irq, NULL);
+
+    if ( desc == NULL )
+    {
+        vector = -domain_irq_to_vector(d, irq);
+        BUG_ON(vector <= 0);
+        desc = &irq_desc[vector];
+        spin_lock_irq(&desc->lock);
+        d->arch.pirq_vector[irq] = d->arch.vector_pirq[vector] = 0;
+    }
+    else
+    {
+        oldaction = __pirq_guest_unbind(d, irq, desc);
+    }
+
+    spin_unlock_irq(&desc->lock);
+
+    if ( oldaction != NULL )
+        xfree(oldaction);
+}
+
+int pirq_guest_force_unbind(struct domain *d, int irq)
+{
+    irq_desc_t *desc;
+    irq_guest_action_t *action, *oldaction = NULL;
+    int i, bound = 0;
+
+    WARN_ON(!spin_is_locked(&d->event_lock));
+
+    BUG_ON(!local_irq_is_enabled());
+    desc = domain_spin_lock_irq_desc(d, irq, NULL);
+    BUG_ON(desc == NULL);
+
+    if ( !(desc->status & IRQ_GUEST) )
+        goto out;
+
+    action = (irq_guest_action_t *)desc->action;
+    for ( i = 0; (i < action->nr_guests) && (action->guest[i] != d); i++ )
+        continue;
+    if ( i == action->nr_guests )
+        goto out;
+
+    bound = 1;
+    oldaction = __pirq_guest_unbind(d, irq, desc);
+
  out:
-    spin_unlock_irqrestore(&desc->lock, flags);    
-    return 0;
+    spin_unlock_irq(&desc->lock);
+
+    if ( oldaction != NULL )
+        xfree(oldaction);
+
+    return bound;
+}
+
+int get_free_pirq(struct domain *d, int type, int index)
+{
+    int i;
+
+    ASSERT(spin_is_locked(&d->event_lock));
+
+    if ( type == MAP_PIRQ_TYPE_GSI )
+    {
+        for ( i = 16; i < NR_IRQS; i++ )
+            if ( !d->arch.pirq_vector[i] )
+                break;
+        if ( i == NR_IRQS )
+            return -ENOSPC;
+    }
+    else
+    {
+        for ( i = NR_IRQS - 1; i >= 16; i-- )
+            if ( !d->arch.pirq_vector[i] )
+                break;
+        if ( i == 16 )
+            return -ENOSPC;
+    }
+
+    return i;
+}
+
+int map_domain_pirq(
+    struct domain *d, int pirq, int vector, int type, void *data)
+{
+    int ret = 0;
+    int old_vector, old_pirq;
+    irq_desc_t *desc;
+    unsigned long flags;
+    struct msi_desc *msi_desc;
+    struct pci_dev *pdev = NULL;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    ASSERT(spin_is_locked(&d->event_lock));
+
+    if ( !IS_PRIV(current->domain) )
+        return -EPERM;
+
+    if ( pirq < 0 || pirq >= NR_IRQS || vector < 0 || vector >= NR_VECTORS )
+    {
+        dprintk(XENLOG_G_ERR, "dom%d: invalid pirq %d or vector %d\n",
+                d->domain_id, pirq, vector);
+        return -EINVAL;
+    }
+
+    old_vector = domain_irq_to_vector(d, pirq);
+    old_pirq = domain_vector_to_irq(d, vector);
+
+    if ( (old_vector && (old_vector != vector) ) ||
+         (old_pirq && (old_pirq != pirq)) )
+    {
+        dprintk(XENLOG_G_ERR, "dom%d: pirq %d or vector %d already mapped\n",
+                d->domain_id, pirq, vector);
+        return -EINVAL;
+    }
+
+    ret = irq_permit_access(d, pirq);
+    if ( ret )
+    {
+        dprintk(XENLOG_G_ERR, "dom%d: could not permit access to irq %d\n",
+                d->domain_id, pirq);
+        return ret;
+    }
+
+    desc = &irq_desc[vector];
+
+    if ( type == MAP_PIRQ_TYPE_MSI )
+    {
+        struct msi_info *msi = (struct msi_info *)data;
+
+        ret = -ENODEV;
+        if ( !cpu_has_apic )
+            goto done;
+
+        pdev = pci_get_pdev(msi->bus, msi->devfn);
+        ret = pci_enable_msi(msi, &msi_desc);
+        if ( ret )
+            goto done;
+
+        spin_lock_irqsave(&desc->lock, flags);
+
+        if ( desc->handler != &no_irq_type )
+            dprintk(XENLOG_G_ERR, "dom%d: vector %d in use\n",
+              d->domain_id, vector);
+        desc->handler = &pci_msi_type;
+        d->arch.pirq_vector[pirq] = vector;
+        d->arch.vector_pirq[vector] = pirq;
+        setup_msi_irq(pdev, msi_desc);
+        spin_unlock_irqrestore(&desc->lock, flags);
+    } else
+    {
+        spin_lock_irqsave(&desc->lock, flags);
+        d->arch.pirq_vector[pirq] = vector;
+        d->arch.vector_pirq[vector] = pirq;
+        spin_unlock_irqrestore(&desc->lock, flags);
+    }
+
+ done:
+    return ret;
+}
+
+/* The pirq should have been unbound before this call. */
+int unmap_domain_pirq(struct domain *d, int pirq)
+{
+    unsigned long flags;
+    irq_desc_t *desc;
+    int vector, ret = 0;
+    bool_t forced_unbind;
+    struct msi_desc *msi_desc = NULL;
+
+    if ( (pirq < 0) || (pirq >= NR_IRQS) )
+        return -EINVAL;
+
+    if ( !IS_PRIV(current->domain) )
+        return -EINVAL;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    ASSERT(spin_is_locked(&d->event_lock));
+
+    vector = domain_irq_to_vector(d, pirq);
+    if ( vector <= 0 )
+    {
+        dprintk(XENLOG_G_ERR, "dom%d: pirq %d not mapped\n",
+                d->domain_id, pirq);
+        ret = -EINVAL;
+        goto done;
+    }
+
+    forced_unbind = pirq_guest_force_unbind(d, pirq);
+    if ( forced_unbind )
+        dprintk(XENLOG_G_WARNING, "dom%d: forcing unbind of pirq %d\n",
+                d->domain_id, pirq);
+
+    desc = &irq_desc[vector];
+
+    if ( (msi_desc = desc->msi_desc) != NULL )
+        pci_disable_msi(msi_desc);
+
+    spin_lock_irqsave(&desc->lock, flags);
+
+    BUG_ON(vector != domain_irq_to_vector(d, pirq));
+
+    if ( msi_desc )
+        teardown_msi_vector(vector);
+
+    if ( desc->handler == &pci_msi_type )
+        desc->handler = &no_irq_type;
+
+    if ( !forced_unbind )
+    {
+        d->arch.pirq_vector[pirq] = 0;
+        d->arch.vector_pirq[vector] = 0;
+    }
+    else
+    {
+        d->arch.pirq_vector[pirq] = -vector;
+        d->arch.vector_pirq[vector] = -pirq;
+    }
+
+    spin_unlock_irqrestore(&desc->lock, flags);
+    if (msi_desc)
+        msi_free_vector(msi_desc);
+
+    ret = irq_deny_access(d, pirq);
+    if ( ret )
+        dprintk(XENLOG_G_ERR, "dom%d: could not deny access to irq %d\n",
+                d->domain_id, pirq);
+
+ done:
+    return ret;
+}
+
+void free_domain_pirqs(struct domain *d)
+{
+    int i;
+
+    spin_lock(&pcidevs_lock);
+    spin_lock(&d->event_lock);
+
+    for ( i = 0; i < NR_IRQS; i++ )
+        if ( d->arch.pirq_vector[i] > 0 )
+            unmap_domain_pirq(d, i);
+
+    spin_unlock(&d->event_lock);
+    spin_unlock(&pcidevs_lock);
 }
 
 extern void dump_ioapic_irq_info(void);
@@ -627,7 +1034,8 @@ static void dump_irqs(unsigned char key)
                        (test_bit(d->pirq_to_evtchn[irq],
                                  shared_info_addr(d, evtchn_pending)) ?
                         'P' : '-'),
-                       (test_bit(d->pirq_to_evtchn[irq]/BITS_PER_GUEST_LONG(d),
+                       (test_bit(d->pirq_to_evtchn[irq] /
+				 BITS_PER_EVTCHN_WORD(d),
                                  vcpu_info_addr(d->vcpu[0], evtchn_pending_sel)) ?
                         'S' : '-'),
                        (test_bit(d->pirq_to_evtchn[irq],
diff -Naurp xen/arch/x86/machine_kexec.c xen-redhat/arch/x86/machine_kexec.c
--- xen/arch/x86/machine_kexec.c
+++ xen-redhat/arch/x86/machine_kexec.c
@@ -140,6 +140,20 @@ void machine_kexec(xen_kexec_image_t *im
     }
 }
 
+void arch_crash_save_vmcoreinfo(void)
+{
+    VMCOREINFO_SYMBOL(dom_xen);
+    VMCOREINFO_SYMBOL(dom_io);
+
+#ifdef CONFIG_X86_PAE
+    VMCOREINFO_SYMBOL_ALIAS(pgd_l3, idle_pg_table);
+#endif
+#ifdef CONFIG_X86_64
+    VMCOREINFO_SYMBOL_ALIAS(pgd_l4, idle_pg_table);
+#endif
+}
+
+
 /*
  * Local variables:
  * mode: C
diff -Naurp xen/arch/x86/Makefile xen-redhat/arch/x86/Makefile
--- xen/arch/x86/Makefile
+++ xen-redhat/arch/x86/Makefile
@@ -12,6 +12,7 @@ obj-y += apic.o
 obj-y += bitops.o
 obj-y += clear_page.o
 obj-y += compat.o
+obj-y += debug.o
 obj-y += delay.o
 obj-y += dmi_scan.o
 obj-y += domctl.o
@@ -24,6 +25,7 @@ obj-y += platform_hypercall.o
 obj-y += i387.o
 obj-y += i8259.o
 obj-y += io_apic.o
+obj-y += ioport_emulate.o
 obj-y += irq.o
 obj-y += microcode.o
 obj-y += mm.o
@@ -45,6 +47,8 @@ obj-y += usercopy.o
 obj-y += x86_emulate.o
 obj-y += machine_kexec.o
 obj-y += crash.o
+obj-y += pci.o
+obj-y += msi.o
 
 obj-$(crash_debug) += gdbstub.o
 
diff -Naurp xen/arch/x86/mm/hap/hap.c xen-redhat/arch/x86/mm/hap/hap.c
--- xen/arch/x86/mm/hap/hap.c
+++ xen-redhat/arch/x86/mm/hap/hap.c
@@ -61,7 +61,7 @@ int hap_enable_log_dirty(struct domain *
     hap_unlock(d);
 
     /* set l1e entries of P2M table to NOT_WRITABLE. */
-    p2m_set_flags_global(d, (_PAGE_PRESENT|_PAGE_USER));
+    p2m_change_entry_type_global(d, (_PAGE_PRESENT|_PAGE_USER));
     flush_tlb_mask(d->domain_dirty_cpumask);
     return 0;
 }
@@ -73,14 +73,14 @@ int hap_disable_log_dirty(struct domain 
     hap_unlock(d);
 
     /* set l1e entries of P2M table with normal mode */
-    p2m_set_flags_global(d, __PAGE_HYPERVISOR|_PAGE_USER);    
+    p2m_change_entry_type_global(d, (__PAGE_HYPERVISOR|_PAGE_USER));
     return 0;
 }
 
 void hap_clean_dirty_bitmap(struct domain *d)
 {
     /* mark physical memory as NOT_WRITEABLE and flush the TLB */
-    p2m_set_flags_global(d, (_PAGE_PRESENT|_PAGE_USER));
+    p2m_change_entry_type_global(d, (_PAGE_PRESENT|_PAGE_USER));
     flush_tlb_mask(d->domain_dirty_cpumask);
 }
 
@@ -593,6 +593,7 @@ int hap_invlpg(struct vcpu *v, unsigned 
  */
 void hap_update_cr3(struct vcpu *v, int do_locking)
 {
+   hvm_update_guest_cr(v, 3);
 }
 
 void hap_update_paging_modes(struct vcpu *v)
@@ -626,8 +627,11 @@ void hap_update_paging_modes(struct vcpu
         mfn_t mmfn = hap_make_monitor_table(v);
         v->arch.monitor_table = pagetable_from_mfn(mmfn);
         make_cr3(v, mfn_x(mmfn));
+        hvm_update_host_cr3(v);
     }
 
+    hap_update_cr3(v, 1);
+
     hap_unlock(d);
 }
 
@@ -674,9 +678,16 @@ void 
 hap_write_p2m_entry(struct vcpu *v, unsigned long gfn, l1_pgentry_t *p,
                     mfn_t table_mfn, l1_pgentry_t new, unsigned int level)
 {
+    uint32_t old_flags;
+
     hap_lock(v->domain);
 
+    old_flags = l1e_get_flags(*p);
     safe_write_pte(p, new);
+    if ( (old_flags & _PAGE_PRESENT)
+         && (level == 1 || (level == 2 && (old_flags & _PAGE_PSE))) )
+             flush_tlb_mask(v->domain->domain_dirty_cpumask);
+
 #if CONFIG_PAGING_LEVELS == 3
     /* install P2M in monitor table for PAE Xen */
     if ( level == 3 ) 
diff -Naurp xen/arch/x86/mm/hap/support.c xen-redhat/arch/x86/mm/hap/support.c
--- xen/arch/x86/mm/hap/support.c
+++ xen-redhat/arch/x86/mm/hap/support.c
@@ -65,7 +65,7 @@ unsigned long hap_gva_to_gfn_protected_m
 
     gpfn = (gcr3 >> PAGE_SHIFT);
     for ( lev = mode; lev >= 1; lev-- ) {
-        mfn = get_mfn_from_gpfn( gpfn );
+        mfn = gmfn_to_mfn(v->domain, gpfn);
         if ( mfn == INVALID_MFN ) {
             HAP_PRINTK("bad pfn=0x%lx from gva=0x%lx at lev%d\n", gpfn, gva, 
                        lev);
@@ -148,7 +148,7 @@ unsigned long hap_gva_to_gfn_pae_mode(st
 
     gpfn = (gcr3 >> PAGE_SHIFT);
     for ( lev = mode; lev >= 1; lev-- ) {
-        mfn = get_mfn_from_gpfn( gpfn );
+        mfn = gmfn_to_mfn(v->domain, gpfn);
         if ( mfn == INVALID_MFN ) {
             HAP_PRINTK("bad pfn=0x%lx from gva=0x%lx at lev%d\n", gpfn, gva, 
                        lev);
@@ -242,7 +242,7 @@ unsigned long hap_gva_to_gfn_long_mode(s
 
     gpfn = (gcr3 >> PAGE_SHIFT);
     for ( lev = mode; lev >= 1; lev-- ) {
-        mfn = get_mfn_from_gpfn( gpfn );
+        mfn = gmfn_to_mfn(v->domain, gpfn);
         if ( mfn == INVALID_MFN ) {
             HAP_PRINTK("bad pfn=0x%lx from gva=0x%lx at lev%d\n", gpfn, gva, 
                        lev);
diff -Naurp xen/arch/x86/mm/Makefile xen-redhat/arch/x86/mm/Makefile
--- xen/arch/x86/mm/Makefile
+++ xen-redhat/arch/x86/mm/Makefile
@@ -3,3 +3,4 @@ subdir-y += hap
 
 obj-y += paging.o
 obj-y += p2m.o
+obj-y += p2m-ept.o
diff -Naurp xen/arch/x86/mm/p2m.c xen-redhat/arch/x86/mm/p2m.c
--- xen/arch/x86/mm/p2m.c
+++ xen-redhat/arch/x86/mm/p2m.c
@@ -27,11 +27,16 @@
 #include <asm/page.h>
 #include <asm/paging.h>
 #include <asm/p2m.h>
+#include <asm/hvm/vmx/vmx.h> /* ept_p2m_init() */
+#include <xen/iommu.h>
 
 /* Debugging and auditing of the P2M code? */
 #define P2M_AUDIT     0
 #define P2M_DEBUGGING 1
 
+static int opt_hap_1gb = 0;
+boolean_param("hap_1gb", opt_hap_1gb);
+
 /*
  * The P2M lock.  This protects all updates to the p2m table.
  * Updates are expected to be safe against concurrent reads, 
@@ -47,6 +52,9 @@
         (_d)->arch.p2m.locker_function = "nobody";   \
     } while (0)
 
+#define p2m_locked_by_me(_d)                     \
+    (current->processor == (_d)->arch.p2m.locker)
+
 #define p2m_lock(_d)                                                \
     do {                                                            \
         if ( unlikely((_d)->arch.p2m.locker == current->processor) )\
@@ -92,8 +100,6 @@
 #undef page_to_mfn
 #define page_to_mfn(_pg) (_mfn((_pg) - frame_table))
 
-
-
 // Find the next level's P2M entry, checking for out-of-range gfn's...
 // Returns NULL on error.
 //
@@ -123,9 +129,11 @@ p2m_next_level(struct domain *d, mfn_t *
                unsigned long *gfn_remainder, unsigned long gfn, u32 shift, 
                u32 max, unsigned long type)
 {
+    l1_pgentry_t *l1_entry;
     l1_pgentry_t *p2m_entry;
     l1_pgentry_t new_entry;
     void *next;
+    int i;
     ASSERT(d->arch.p2m.alloc_page);
 
     if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
@@ -140,10 +148,8 @@ p2m_next_level(struct domain *d, mfn_t *
         list_add_tail(&pg->list, &d->arch.p2m.pages);
         pg->u.inuse.type_info = type | 1 | PGT_validated;
         pg->count_info = 1;
-
         new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
                                  __PAGE_HYPERVISOR|_PAGE_USER);
-
         switch ( type ) {
         case PGT_l3_page_table:
             paging_write_p2m_entry(d, gfn, 
@@ -166,6 +172,70 @@ p2m_next_level(struct domain *d, mfn_t *
             break;
         }
     }
+
+    /* split 1GB pages into 2MB pages */
+    if ( type == PGT_l2_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+    {
+        unsigned long flags, pfn;
+        struct page_info *pg = d->arch.p2m.alloc_page(d);
+        if ( pg == NULL )
+            return 0;
+        list_add_tail(&pg->list, &d->arch.p2m.pages);
+        pg->u.inuse.type_info = PGT_l1_page_table | 1| PGT_validated;
+        pg->count_info = 1;
+
+        flags = l1e_get_flags(*p2m_entry);
+        pfn = l1e_get_pfn(*p2m_entry);
+
+        l1_entry = map_domain_page(mfn_x(page_to_mfn(pg)));
+        for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
+        {
+            new_entry = l1e_from_pfn(pfn + (i * L1_PAGETABLE_ENTRIES), flags);
+            paging_write_p2m_entry(d, gfn, l1_entry+i, *table_mfn, new_entry,
+                                   2);
+        }
+        unmap_domain_page(l1_entry);
+        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+                                 __PAGE_HYPERVISOR|_PAGE_USER);
+        paging_write_p2m_entry(d, gfn,
+                               p2m_entry, *table_mfn, new_entry, 3);
+    }
+
+    /* split a single large page into 4KB pages in the P2M table */
+    if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+    {
+        unsigned long flags, pfn;
+        struct page_info *pg = d->arch.p2m.alloc_page(d);
+        if ( pg == NULL )
+            return 0;
+        list_add_tail(&pg->list, &d->arch.p2m.pages);
+        pg->u.inuse.type_info = PGT_l1_page_table | 1 | PGT_validated;
+        pg->count_info = 1;
+        
+        /* New splintered mappings inherit the flags of the old superpage,
+         * with a little reorganisation for the _PAGE_PSE_PAT bit. */
+        flags = l1e_get_flags(*p2m_entry);
+        pfn = l1e_get_pfn(*p2m_entry);
+        if ( pfn & 1 )           /* ==> _PAGE_PSE_PAT was set */
+            pfn -= 1;            /* Clear it; _PAGE_PSE becomes _PAGE_PAT */
+        else
+            flags &= ~_PAGE_PSE; /* Clear _PAGE_PSE (== _PAGE_PAT) */
+        
+        l1_entry = map_domain_page(mfn_x(page_to_mfn(pg)));
+        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+        {
+            new_entry = l1e_from_pfn(pfn + i, flags);
+            paging_write_p2m_entry(d, gfn,
+                                   l1_entry+i, *table_mfn, new_entry, 1);
+        }
+        unmap_domain_page(l1_entry);
+        
+        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+                                 __PAGE_HYPERVISOR|_PAGE_USER);
+        paging_write_p2m_entry(d, gfn,
+                               p2m_entry, *table_mfn, new_entry, 2);
+    }
+    
     *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
     next = map_domain_page(mfn_x(*table_mfn));
     unmap_domain_page(*table);
@@ -176,7 +246,8 @@ p2m_next_level(struct domain *d, mfn_t *
 
 // Returns 0 on error (out of memory)
 static int
-set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, u32 l1e_flags)
+p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
+                int order, u32 l1e_flags)
 {
     // XXX -- this might be able to be faster iff current->domain == d
     mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
@@ -184,7 +255,12 @@ set_p2m_entry(struct domain *d, unsigned
     unsigned long gfn_remainder = gfn;
     l1_pgentry_t *p2m_entry;
     l1_pgentry_t entry_content;
+    l2_pgentry_t l2e_content;
+    p2m_type_t p2mt = p2m_flags_to_type(l1e_flags);
     int rv=0;
+#if CONFIG_PAGING_LEVELS >= 3
+    l3_pgentry_t l3e_content;
+#endif
 
 #if CONFIG_PAGING_LEVELS >= 4
     if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
@@ -193,40 +269,104 @@ set_p2m_entry(struct domain *d, unsigned
         goto out;
 #endif
 #if CONFIG_PAGING_LEVELS >= 3
-    // When using PAE Xen, we only allow 33 bits of pseudo-physical
-    // address in translated guests (i.e. 8 GBytes).  This restriction
-    // comes from wanting to map the P2M table into the 16MB RO_MPT hole
-    // in Xen's address space for translated PV guests.
-    //
-    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
-                         L3_PAGETABLE_SHIFT - PAGE_SHIFT,
-                         (CONFIG_PAGING_LEVELS == 3
-                          ? 8
-                          : L3_PAGETABLE_ENTRIES),
-                         PGT_l2_page_table) )
+    /* Try to allocate 1GB page table if this feature is supported.
+     * When using PAE Xen, we only allow 33 bits of pseudo-physical
+     * address in translated guests (i.e. 8 GBytes).  This restriction
+     * comes from wanting to map the P2M table into the 16MB RO_MPT hole
+     * in Xen's address space for translated PV guests.
+     * When using AMD's NPT on PAE Xen, we are restricted to 4GB.
+     */
+    if ( order == 18 ) /* 1GB page */
+    {
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+                                   L3_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
+             !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+        {
+            P2M_ERROR("configure P2M table L3 entry with large page\n");
+            domain_crash(d);
+            goto out;
+        }
+
+        if ( mfn_valid(mfn) )
+            l3e_content = l3e_from_pfn(mfn_x(mfn),
+                                       __PAGE_HYPERVISOR|_PAGE_USER|_PAGE_PSE);
+        else
+            l3e_content = l3e_empty();
+
+        entry_content.l1 = l3e_content.l3;
+        paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 3);
+    }
+    else if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+                              L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+                              ((CONFIG_PAGING_LEVELS == 3)
+                               ? (hvm_funcs.hap_supported ? 4 : 8)
+                               : L3_PAGETABLE_ENTRIES),
+                              PGT_l2_page_table) )
         goto out;
 #endif
-    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
-                         L2_PAGETABLE_SHIFT - PAGE_SHIFT,
-                         L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
-        goto out;
+    
+    if ( order == 0 ) 
+    {
+        if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+                             L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                             L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
+            goto out;
+        
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   0, L1_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        
+        if ( mfn_valid(mfn) || p2mt == p2m_mmio_direct )
+            entry_content = l1e_from_pfn(mfn_x(mfn), l1e_flags);
+        else
+            entry_content = l1e_empty();
+        
+        /* level 1 entry */
+        paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 1);
+    }
+    else if ( order == 9 )
+    {
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                                   L2_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        
+        if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
+             !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+        {
+            P2M_ERROR("configure P2M table 4KB L2 entry with large page\n");
+            domain_crash(d);
+            goto out;
+        }
 
-    p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
-                               0, L1_PAGETABLE_ENTRIES);
-    ASSERT(p2m_entry);
+        if ( mfn_valid(mfn) )
+            l2e_content = l2e_from_pfn(mfn_x(mfn),
+                                       __PAGE_HYPERVISOR|_PAGE_USER|_PAGE_PSE);
+        else
+            l2e_content = l2e_empty();
+
+        entry_content.l1 = l2e_content.l2;
+        paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 2);
+    }
 
     /* Track the highest gfn for which we have ever had a valid mapping */
     if ( mfn_valid(mfn) && (gfn > d->arch.p2m.max_mapped_pfn) ) 
-        d->arch.p2m.max_mapped_pfn = gfn;
-
-    if ( mfn_valid(mfn) )
-        entry_content = l1e_from_pfn(mfn_x(mfn), l1e_flags);
-    else
-        entry_content = l1e_empty();
-
-    /* level 1 entry */
-    paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 1);
-
+        d->arch.p2m.max_mapped_pfn = gfn + (1UL << order) - 1;
+
+    if ( iommu_enabled )
+    {
+        int i;
+        if ( p2mt == p2m_ram_rw )
+            for ( i = 0; i < (1UL << order); i++ )
+                iommu_map_page(d, gfn+i, mfn_x(mfn)+i);
+        else
+            for ( i = 0; i < (1UL << order); i++ )
+                iommu_unmap_page(d, gfn+i);
+    }
+
     /* Success */
     rv = 1;
  
@@ -235,15 +375,60 @@ set_p2m_entry(struct domain *d, unsigned
     return rv;
 }
 
+static mfn_t
+p2m_gfn_to_mfn(struct domain *d, unsigned long gfn);
 
 /* Init the datastructures for later use by the p2m code */
 void p2m_init(struct domain *d)
 {
     p2m_lock_init(d);
     INIT_LIST_HEAD(&d->arch.p2m.pages);
+
+    d->arch.p2m.set_entry = p2m_set_entry;
+    d->arch.p2m.get_entry = p2m_gfn_to_mfn;
+    d->arch.p2m.get_entry_fast = p2m_gfn_to_mfn_fast;
+    d->arch.p2m.change_entry_type_global = p2m_set_flags_global;
+
+    if ( is_hvm_domain(d) && hap_enabled(d) &&
+         (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
+        ept_p2m_init(d);
+}
+
+void p2m_change_entry_type_global(struct domain *d, u32 l1e_flags)
+{
+    p2m_lock(d);
+    d->arch.p2m.change_entry_type_global(d, l1e_flags);
+    p2m_unlock(d);
 }
 
+static inline
+int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
+                    int page_order, u32 l1e_flags)
+{
+    unsigned long todo = 1ul << page_order;
+    unsigned int order;
+    int rc = 0;
+
+    while ( todo )
+    {
+        /* decide which page mode to use */
+        if ( hap_enabled(d) )
+            order = ((((gfn | mfn_x(mfn) | todo) & ((1ul << 18) - 1)) == 0) &&
+                     hap_1gb_pgtb(d) && opt_hap_1gb) ? 18 :
+                    ((((gfn | mfn_x(mfn) | todo) & ((1ul << 9) - 1)) == 0) ? 9 : 0);
+        else
+            order = 0;
+
+        rc = d->arch.p2m.set_entry(d, gfn, mfn, order, l1e_flags);
+        gfn += 1ul << order;
+        if ( mfn_x(mfn) != INVALID_MFN )
+            mfn = _mfn(mfn_x(mfn) + (1ul << order));
+        todo -= 1ul << order;
+    }
 
+    return rc;
+}
+ 
 // Allocate a new p2m table for a domain.
 //
 // The structure of the p2m table is that of a pagetable for xen (i.e. it is
@@ -305,7 +490,7 @@ int p2m_alloc_table(struct domain *d,
     /* Initialise physmap tables for slot zero. Other code assumes this. */
     gfn = 0;
     mfn = _mfn(INVALID_MFN);
-    if ( !set_p2m_entry(d, gfn, mfn, __PAGE_HYPERVISOR|_PAGE_USER) )
+    if ( !set_p2m_entry(d, gfn, mfn, 0, __PAGE_HYPERVISOR|_PAGE_USER) )
         goto error;
 
     for ( entry = d->page_list.next;
@@ -323,7 +508,7 @@ int p2m_alloc_table(struct domain *d,
             (gfn != 0x55555555L)
 #endif
              && gfn != INVALID_M2P_ENTRY
-             && !set_p2m_entry(d, gfn, mfn, __PAGE_HYPERVISOR|_PAGE_USER) )
+             && !set_p2m_entry(d, gfn, mfn, 0, __PAGE_HYPERVISOR|_PAGE_USER) )
             goto error;
     }
 
@@ -358,7 +543,7 @@ void p2m_teardown(struct domain *d)
 }
 
 mfn_t
-gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
+p2m_gfn_to_mfn(struct domain *d, unsigned long gpfn)
 /* Read another domain's p2m entries */
 {
     mfn_t mfn;
@@ -405,6 +590,14 @@ gfn_to_mfn_foreign(struct domain *d, uns
             unmap_domain_page(l3e);
             return _mfn(INVALID_MFN);
         }
+        else if ( (l3e_get_flags(*l3e) & _PAGE_PSE) )
+        {
+            mfn = _mfn(l3e_get_pfn(*l3e) +
+                       l2_table_offset(addr) * L2_PAGETABLE_ENTRIES +
+                       l1_table_offset(addr));
+            unmap_domain_page(l3e);
+            return mfn_valid(mfn) ? mfn : _mfn(INVALID_MFN);
+        }
         mfn = _mfn(l3e_get_pfn(*l3e));
         unmap_domain_page(l3e);
     }
@@ -417,6 +610,14 @@ gfn_to_mfn_foreign(struct domain *d, uns
         unmap_domain_page(l2e);
         return _mfn(INVALID_MFN);
     }
+    else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) )
+    {
+        mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr));
+        unmap_domain_page(l2e);
+        
+        return mfn_valid(mfn) ? mfn : _mfn(INVALID_MFN);
+    }
+    
     mfn = _mfn(l2e_get_pfn(*l2e));
     unmap_domain_page(l2e);
 
@@ -504,7 +705,7 @@ static void audit_p2m(struct domain *d)
             /* This m2p entry is stale: the domain has another frame in
              * this physical slot.  No great disaster, but for neatness,
              * blow away the m2p entry. */ 
-            set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY, __PAGE_HYPERVISOR|_PAGE_USER);
+            set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
         }
 
         if ( test_linear && (gfn <= d->arch.p2m.max_mapped_pfn) )
@@ -562,6 +763,30 @@ static void audit_p2m(struct domain *d)
                     gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
                     continue;
                 }
+
+                /* check for super page */
+                if ( l3e_get_flags(l3e[i3]) & _PAGE_PSE )
+                {
+                    mfn = l3e_get_pfn(l3e[i3]);
+                    ASSERT(mfn_valid(_mfn(mfn)));
+                    /* we have to cover 512x512 4K pages */
+                    for ( i2 = 0;
+                          i2 < (L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES);
+                          i2++)
+                    {
+                        m2pfn = get_gpfn_from_mfn(mfn+i2);
+                        if ( m2pfn != (gfn + i2) )
+                        {
+                            pmbad++;
+                            P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+                                       " -> gfn %#lx\n", gfn+i2, mfn+i2,
+                                       m2pfn);
+                            BUG();
+                        }
+                    }
+                    gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
+                    continue;
+                }
                 l2e = map_domain_page(mfn_x(_mfn(l3e_get_pfn(l3e[i3]))));
 #endif /* all levels... */
                 for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
@@ -571,6 +796,29 @@ static void audit_p2m(struct domain *d)
                         gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
                         continue;
                     }
+
+                    
+                    /* check for super page */
+                    if ( l2e_get_flags(l2e[i2]) & _PAGE_PSE )
+                    {
+                        mfn = l2e_get_pfn(l2e[i2]);
+                        ASSERT(mfn_valid(_mfn(mfn)));
+                        for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++)
+                        {
+                            m2pfn = get_gpfn_from_mfn(mfn+i1);
+                            if ( m2pfn != (gfn + i) )
+                            {
+                                pmbad++;
+                                P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+                                           " -> gfn %#lx\n", gfn+i, mfn+i,
+                                           m2pfn);
+                                BUG();
+                            }
+                        }
+                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
+                        continue;
+                    }
+
                     l1e = map_domain_page(mfn_x(_mfn(l2e_get_pfn(l2e[i2]))));
                     
                     for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
@@ -624,8 +872,10 @@ static void audit_p2m(struct domain *d)
 
 
 static void
-p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
+p2m_remove_page(struct domain *d, unsigned long gfn,
+                    unsigned long mfn, int order)
 {
+    unsigned long i;
     if ( !paging_mode_translate(d) )
         return;
     P2M_DEBUG("removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
@@ -633,30 +883,50 @@ p2m_remove_page(struct domain *d, unsign
     ASSERT(mfn_x(gfn_to_mfn(d, gfn)) == mfn);
     //ASSERT(mfn_to_gfn(d, mfn) == gfn);
 
-    set_p2m_entry(d, gfn, _mfn(INVALID_MFN), __PAGE_HYPERVISOR|_PAGE_USER);
-    set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+    set_p2m_entry(d, gfn, _mfn(INVALID_MFN), order,
+                         __PAGE_HYPERVISOR|_PAGE_USER);
+    for ( i = 0; i < (1UL << order); i++ )
+        set_gpfn_from_mfn(mfn+i, INVALID_M2P_ENTRY);
 }
 
 void
 guest_physmap_remove_page(struct domain *d, unsigned long gfn,
-                          unsigned long mfn)
+                          unsigned long mfn, int order)
 {
     p2m_lock(d);
     audit_p2m(d);
-    p2m_remove_page(d, gfn, mfn);
+    p2m_remove_page(d, gfn, mfn, order);
     audit_p2m(d);
     p2m_unlock(d);    
 }
 
-void
-guest_physmap_add_page(struct domain *d, unsigned long gfn,
-                       unsigned long mfn)
+int
+guest_physmap_add_entry(struct domain *d, unsigned long gfn,
+                       unsigned long mfn, int order, u32 l1e_flags)
 {
-    unsigned long ogfn;
+    unsigned long ogfn, i;
     mfn_t omfn;
+    int rc = 0;
 
     if ( !paging_mode_translate(d) )
-        return;
+        return -EINVAL;
+
+#if CONFIG_PAGING_LEVELS == 3
+    /*
+     * 32bit AMD nested paging does not support over 4GB guest due to 
+     * hardware translation limit. This limitation is checked by comparing
+     * gfn with 0xfffffUL.
+     */
+    if ( paging_mode_hap(d) && (gfn > 0xfffffUL) &&
+         (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
+    {
+        if ( !test_and_set_bool(d->arch.hvm_domain.amd_npt_4gb_warning) )
+            dprintk(XENLOG_WARNING, "Dom%d failed to populate memory beyond"
+                    " 4GB: remove 'hap' Xen boot parameter.\n",
+                    d->domain_id);
+        return -EINVAL;
+    }
+#endif
 
     p2m_lock(d);
     audit_p2m(d);
@@ -666,8 +936,11 @@ guest_physmap_add_page(struct domain *d,
     omfn = gfn_to_mfn(d, gfn);
     if ( mfn_valid(omfn) )
     {
-        set_p2m_entry(d, gfn, _mfn(INVALID_MFN), __PAGE_HYPERVISOR|_PAGE_USER);
-        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+        if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), order, l1e_flags) )
+            rc = -EINVAL;
+
+        for ( i = 0; i < (1UL << order); i++)
+            set_gpfn_from_mfn(mfn_x(omfn)+i, INVALID_M2P_ENTRY);
     }
 
     ogfn = mfn_to_gfn(d, _mfn(mfn));
@@ -688,15 +961,34 @@ guest_physmap_add_page(struct domain *d,
             P2M_DEBUG("old gfn=%#lx -> mfn %#lx\n", 
                       ogfn , mfn_x(omfn));
             if ( mfn_x(omfn) == mfn ) 
-                p2m_remove_page(d, ogfn, mfn);
+                p2m_remove_page(d, ogfn, mfn, order);
         }
     }
 
-    set_p2m_entry(d, gfn, _mfn(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
-    set_gpfn_from_mfn(mfn, gfn);
+    if ( !set_p2m_entry(d, gfn, _mfn(mfn), order, l1e_flags) )
+        rc = -EINVAL;
+
+    for ( i = 0; i < (1UL << order); i++ )
+        set_gpfn_from_mfn(mfn+i, gfn+i);
 
     audit_p2m(d);
     p2m_unlock(d);
+
+    return rc;
+}
+
+int
+guest_physmap_add_page(struct domain *d, unsigned long gfn,
+                       unsigned long mfn, int order)
+{
+    int ret = 0;
+
+
+    ret = guest_physmap_add_entry(d, gfn, mfn, order,
+                                  __PAGE_HYPERVISOR | _PAGE_USER);
+
+    /* TODO: fix the exit path on failure */
+    return ret;
 }
 
 /* This function goes through P2M table and modify l1e flags of all pages. Note
@@ -706,15 +998,16 @@ guest_physmap_add_page(struct domain *d,
  */
 void p2m_set_flags_global(struct domain *d, u32 l1e_flags)
 {
-    unsigned long mfn, gfn;
+    unsigned long mfn, gfn, flags;
     l1_pgentry_t l1e_content;
     l1_pgentry_t *l1e;
     l2_pgentry_t *l2e;
-    mfn_t l1mfn;
+    mfn_t l1mfn, l2mfn;
     int i1, i2;
 #if CONFIG_PAGING_LEVELS >= 3
     l3_pgentry_t *l3e;
     int i3;
+    mfn_t l3mfn;
 #if CONFIG_PAGING_LEVELS == 4
     l4_pgentry_t *l4e;
     int i4;
@@ -727,13 +1020,15 @@ void p2m_set_flags_global(struct domain 
     if ( pagetable_get_pfn(d->arch.phys_table) == 0 )
         return;
 
-    p2m_lock(d);
-        
+    ASSERT(p2m_locked_by_me(d));
+
 #if CONFIG_PAGING_LEVELS == 4
     l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
 #elif CONFIG_PAGING_LEVELS == 3
+    l3mfn = _mfn(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
     l3e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
 #else /* CONFIG_PAGING_LEVELS == 2 */
+    l2mfn = _mfn(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
     l2e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
 #endif
 
@@ -745,6 +1040,7 @@ void p2m_set_flags_global(struct domain 
 	{
 	    continue;
 	}
+        l3mfn = _mfn(l4e_get_pfn(l4e[i4]));
 	l3e = map_domain_page(l4e_get_pfn(l4e[i4]));
 #endif /* now at levels 3 or 4... */
 	for ( i3 = 0; 
@@ -755,6 +1051,19 @@ void p2m_set_flags_global(struct domain 
 	    {
 		continue;
 	    }
+	    if ( (l3e_get_flags(l3e[i3]) & _PAGE_PSE) )
+	    {
+		flags = l3e_get_flags(l3e[i3]);
+		mfn = l3e_get_pfn(l3e[i3]);
+		gfn = get_gpfn_from_mfn(mfn);
+		flags = l1e_flags;
+		l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
+		paging_write_p2m_entry(d, gfn, (l1_pgentry_t *)&l3e[i3],
+				       l3mfn, l1e_content, 3);
+		continue;
+	    }
+
+            l2mfn = _mfn(l3e_get_pfn(l3e[i3]));
 	    l2e = map_domain_page(l3e_get_pfn(l3e[i3]));
 #endif /* all levels... */
 	    for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
@@ -764,6 +1073,18 @@ void p2m_set_flags_global(struct domain 
 		    continue;
 		}
 
+                if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE) )
+                {
+                    flags = l2e_get_flags(l2e[i2]);
+                    mfn = l2e_get_pfn(l2e[i2]);
+                    gfn = get_gpfn_from_mfn(mfn);
+                    flags = l1e_flags;
+                    l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
+                    paging_write_p2m_entry(d, gfn, (l1_pgentry_t *)&l2e[i2],
+                                           l2mfn, l1e_content, 2);
+                    continue;
+                }
+
                 l1mfn = _mfn(l2e_get_pfn(l2e[i2]));
 		l1e = map_domain_page(mfn_x(l1mfn));
 		
@@ -797,7 +1118,6 @@ void p2m_set_flags_global(struct domain 
     unmap_domain_page(l2e);
 #endif
 
-    p2m_unlock(d);
 }
 
 /* This function traces through P2M table and modifies l1e flags of a specific
@@ -813,13 +1133,56 @@ int p2m_set_flags(struct domain *d, padd
     gfn = gpa >> PAGE_SHIFT;
     mfn = gfn_to_mfn(d, gfn);
     if ( mfn_valid(mfn) )
-        set_p2m_entry(d, gfn, mfn, l1e_flags);
+        set_p2m_entry(d, gfn, mfn, 0, l1e_flags);
     
     p2m_unlock(d);
 
     return 1;
 }
 
+int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+{
+    int rc = 0;
+    unsigned long omfn;
+
+    if ( !paging_mode_translate(d) )
+        return 0;
+
+    omfn = gmfn_to_mfn(d, gfn);
+    if ( INVALID_MFN != omfn )
+    {
+        ASSERT(mfn_valid(_mfn(omfn)));
+        set_gpfn_from_mfn(omfn, INVALID_M2P_ENTRY);
+    }
+
+    rc = set_p2m_entry(d, gfn, mfn, 0, p2m_type_to_flags(p2m_mmio_direct));
+    if ( 0 == rc )
+        gdprintk(XENLOG_ERR,
+            "set_mmio_p2m_entry: set_p2m_entry failed! mfn=%08lx\n",
+            gmfn_to_mfn(d, gfn));
+    return rc;
+}
+
+int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn)
+{
+    int rc = 0;
+    unsigned long mfn;
+
+    if ( !paging_mode_translate(d) )
+        return 0;
+
+    mfn = gmfn_to_mfn(d, gfn);
+    if ( INVALID_MFN == mfn )
+    {
+        gdprintk(XENLOG_ERR,
+            "clear_mmio_p2m_entry: gfn_to_mfn failed! gfn=%08lx\n", gfn);
+        return 0;
+    }
+    rc = set_p2m_entry(d, gfn, _mfn(INVALID_MFN), 0, 0);
+
+    return rc;
+}
+
 /*
  * Local variables:
  * mode: C
diff -Naurp xen/arch/x86/mm/p2m-ept.c xen-redhat/arch/x86/mm/p2m-ept.c
--- xen/arch/x86/mm/p2m-ept.c
+++ xen-redhat/arch/x86/mm/p2m-ept.c
@@ -0,0 +1,570 @@
+/*
+ * p2m-ept.c: use the EPT page table as p2m
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <xen/config.h>
+#include <xen/domain_page.h>
+#include <xen/sched.h>
+#include <asm/current.h>
+#include <asm/types.h>
+#include <asm/domain.h>
+#include <asm/hvm/vmx/vmx.h>
+#include <xen/iocap.h>
+#include <asm/mtrr.h>
+
+#if 1 /* XEN_VERSION == 3 && XEN_SUBVERSION < 2 */
+
+static int ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
+                                int order, p2m_type_t p2mt);
+mfn_t ept_get_entry(struct domain *d, unsigned long gfn, p2m_type_t *t);
+static mfn_t ept_get_entry_fast(unsigned long gfn, p2m_type_t *t);
+
+static p2m_type_t ept_flags_to_p2m_type(u32 l1e_flags)
+{
+    if ( l1e_flags & _PAGE_RW )
+        return p2m_ram_rw;
+    else if ( paging_mode_log_dirty(current->domain) )
+        return p2m_ram_logdirty;
+    return p2m_invalid;
+}
+
+static inline int
+compat_ept_set_entry(struct domain *d, unsigned long gfn,
+  mfn_t mfn, int order, u32 l1e_flags)
+{
+    p2m_type_t t = ept_flags_to_p2m_type(l1e_flags);
+    if ( t == p2m_ram_rw && mfn_x(mfn) != INVALID_MFN &&
+         iomem_access_permitted(d, mfn_x(mfn), mfn_x(mfn)) )
+        t = p2m_mmio_direct;
+
+    return ept_set_entry(d, gfn, mfn, order, t);
+}
+
+static mfn_t compat_ept_get_entry(struct domain *d, unsigned long gfn)
+{
+    p2m_type_t dummy;
+    return ept_get_entry(d, gfn, &dummy);
+}
+
+static mfn_t compat_ept_get_entry_fast(unsigned long gfn)
+{
+    p2m_type_t dummy;
+    return ept_get_entry_fast(gfn, &dummy);
+}
+#else
+
+#define compat_ept_set_entry ept_set_entry
+#define compat_ept_get_entry ept_get_entry
+#define compat_ept_get_entry_fast ept_get_entry_fast
+
+#endif
+
+uint8_t epte_get_entry_emt(
+    struct domain *d, unsigned long gfn,
+    unsigned long mfn, uint8_t *igmt, int direct_mmio)
+{
+    struct vcpu *v = current;
+
+    *igmt = 0;
+
+    if ( (current->domain != d) && ((v = d->vcpu[0]) == NULL) )
+        return MTRR_TYPE_WRBACK;
+
+    if ( !mfn_valid(mfn) )
+        return MTRR_TYPE_UNCACHABLE;
+
+    if ( !iommu_enabled )
+    {
+        *igmt = 1;
+        return MTRR_TYPE_WRBACK;
+    }
+
+    if ( direct_mmio )
+        return MTRR_TYPE_UNCACHABLE;
+
+    if ( iommu_snoop )
+    {
+        *igmt = 1;
+        return MTRR_TYPE_WRBACK;
+    }
+
+    return MTRR_TYPE_WRBACK; 
+}
+
+static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type)
+{
+    switch(type)
+    {
+        case p2m_ram_rw:
+             entry->r = entry->w = entry->x = 1;
+            return;
+        case p2m_mmio_direct:
+             entry->r = entry->w = entry->x = 1;
+            return;
+        case p2m_ram_logdirty:
+        case p2m_ram_ro:
+             entry->r = entry->x = 1;
+             entry->w = 0;
+            return;
+        case p2m_invalid:
+        case p2m_mmio_dm:
+        default:
+            return;
+    }
+}
+
+#define GUEST_TABLE_NORMAL_PAGE 1
+#define GUEST_TABLE_SUPER_PAGE  2
+#define GUEST_TABLE_SPLIT_PAGE  3
+
+static struct page_info *ept_alloc_middle_page(struct domain *d)
+{
+    struct page_info *pg;
+
+    pg = d->arch.p2m.alloc_page(d);
+    if ( pg == NULL )
+        return NULL;
+
+    pg->count_info = 1;
+    pg->u.inuse.type_info = 1 | PGT_validated;
+    list_add_tail(&pg->list, &d->arch.p2m.pages);
+
+    return pg;
+}
+
+static ept_entry_t ept_set_middle_entry(ept_entry_t *ept_entry,
+                                        struct page_info *pg)
+{
+    ept_entry_t e;
+
+    e.epte = 0;
+    e.mfn = page_to_mfn(pg);
+    /* last step */
+    e.r = e.w = e.x = 1;
+    *ept_entry = e;
+    return e;
+}
+
+static int ept_next_level(struct domain *d, bool_t read_only,
+                          ept_entry_t **table, unsigned long *gfn_remainder,
+                          u32 shift, int order)
+{
+    ept_entry_t ept_entry, *next;
+    struct page_info *pg;
+    u32 index;
+
+    index = *gfn_remainder >> shift;
+
+    ept_entry = (*table)[index];
+
+    if ( !(ept_entry.epte & 0x7) )
+    {
+        if ( read_only )
+            return 0;
+
+        pg = ept_alloc_middle_page(d);
+        if ( pg == NULL )
+            return 0;
+        ept_entry = ept_set_middle_entry((*table) + index, pg);
+    }
+
+    if ( !ept_entry.sp_avail )
+    {
+        *gfn_remainder &= (1UL << shift) - 1;
+        next = map_domain_page(ept_entry.mfn);
+        unmap_domain_page(*table);
+        *table = next;
+        return GUEST_TABLE_NORMAL_PAGE;
+    }
+    else
+    {
+        if ( order == shift || read_only )
+            return GUEST_TABLE_SUPER_PAGE;
+        else
+            return GUEST_TABLE_SPLIT_PAGE;
+    }
+}
+
+static int
+ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
+                     int order, p2m_type_t p2mt)
+{
+    ept_entry_t *table = NULL;
+    unsigned long gfn_remainder = gfn, offset=0;
+    ept_entry_t ept_entry;
+    u32 index;
+    int i, rv = 0, ret = 0;
+    int walk_level = order / EPT_TABLE_ORDER;
+    int direct_mmio = (p2mt == p2m_mmio_direct);
+    uint8_t igmt = 0; 
+    int need_modify_vtd_table = 1;
+
+    /* We only support 4k and 2m pages now */
+
+    BUG_ON(order && order != EPT_TABLE_ORDER);
+
+    if (  order != 0 )
+        if ( (gfn & ((1UL << order) - 1)) )
+            return 1;
+
+    table = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
+
+    ASSERT(table != NULL);
+
+    for ( i = EPT_DEFAULT_GAW; i > walk_level; i-- )
+    {
+        ret = ept_next_level(d, 0, &table,
+                    &gfn_remainder, i * EPT_TABLE_ORDER, order);
+        if ( !ret )
+            goto out;
+        else if ( ret != GUEST_TABLE_NORMAL_PAGE )
+            break;
+    }
+
+    index = gfn_remainder >> ( i ?  (i * EPT_TABLE_ORDER): order);
+    walk_level = ( i ? ( i * EPT_TABLE_ORDER) : order) / EPT_TABLE_ORDER;
+    offset = (gfn_remainder & ( ((1 << (i*EPT_TABLE_ORDER)) - 1)));
+
+    ept_entry = table[index];
+
+    if ( ret != GUEST_TABLE_SPLIT_PAGE )
+    {
+        if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) )
+        {
+            ept_entry.emt = epte_get_entry_emt(d, gfn, mfn_x(mfn),
+                                &igmt, direct_mmio);
+            ept_entry.igmt = igmt;
+            ept_entry.sp_avail = walk_level ? 1 : 0;
+
+            if ( ret == GUEST_TABLE_SUPER_PAGE )
+            {
+                if ( ept_entry.mfn == (mfn_x(mfn) - offset) )
+                    need_modify_vtd_table = 0;
+                else
+                    ept_entry.mfn = mfn_x(mfn) - offset;
+
+                if ( ept_entry.avail1 == p2m_ram_logdirty &&
+                  p2mt == p2m_ram_rw )
+                    for ( i = 0; i < (1UL << order); i++ )
+                        paging_mark_dirty(d, mfn_x(mfn)-offset+i);
+            }
+            else
+            {
+                if ( ept_entry.mfn == mfn_x(mfn) )
+                    need_modify_vtd_table = 0;
+                else
+                    ept_entry.mfn = mfn_x(mfn);
+            }
+
+
+            ept_entry.avail1 = p2mt;
+            ept_entry.rsvd = 0;
+            ept_entry.avail2 = 0;
+            /* last step */
+            ept_entry.r = ept_entry.w = ept_entry.x = 1;
+            ept_p2m_type_to_flags(&ept_entry, p2mt);
+        }
+        else
+            ept_entry.epte = 0;
+
+        table[index] = ept_entry;
+    }
+    else
+    {
+        /* The entry was a superpage; now we need to set one of its
+         * 4k pages, so split the 2m page into 4k pages first.
+         */
+
+        ept_entry_t *split_table = NULL;
+        ept_entry_t split_ept_entry;
+        unsigned long split_mfn = ept_entry.mfn;
+        p2m_type_t split_p2mt = ept_entry.avail1;
+        struct page_info *pg;
+
+        /* Allocate a new page for the EPT middle-level entry that
+         * replaces the former leaf superpage entry.
+         */
+
+        pg = ept_alloc_middle_page(d);
+        if ( pg == NULL )
+            goto out;
+
+        /* Split the former superpage into 4k pages */
+
+        split_table = map_domain_page(page_to_mfn(pg));
+        offset = gfn & ((1 << EPT_TABLE_ORDER) - 1);
+
+        for ( i = 0; i < 512; i++ )
+        {
+            split_ept_entry = split_table[i];
+            split_ept_entry.emt = epte_get_entry_emt(d,
+                                        gfn-offset+i, split_mfn+i,
+                                        &igmt, direct_mmio);
+            split_ept_entry.igmt = igmt;
+
+            split_ept_entry.sp_avail =  0;
+
+            split_ept_entry.mfn = split_mfn+i;
+
+            split_ept_entry.avail1 = split_p2mt;
+            split_ept_entry.rsvd = 0;
+            split_ept_entry.avail2 = 0;
+            /* last step */
+            split_ept_entry.r = split_ept_entry.w = split_ept_entry.x = 1;
+            ept_p2m_type_to_flags(&split_ept_entry, split_p2mt);
+            split_table[i] = split_ept_entry;
+        }
+
+        /* Set the target 4k page entry to the requested mapping */
+        split_ept_entry = split_table[offset];
+        split_ept_entry.emt = epte_get_entry_emt(d, gfn, mfn_x(mfn),
+                                                &igmt, direct_mmio);
+        split_ept_entry.igmt = igmt;
+        if ( split_ept_entry.mfn == mfn_x(mfn) )
+            need_modify_vtd_table = 0;
+        else
+            split_ept_entry.mfn = mfn_x(mfn);
+
+        split_ept_entry.avail1 = p2mt;
+        ept_p2m_type_to_flags(&split_ept_entry, p2mt);
+        split_table[offset] = split_ept_entry;
+
+        unmap_domain_page(split_table);
+
+        ept_set_middle_entry(table + index, pg);
+    }
+
+    /* Track the highest gfn for which we have ever had a valid mapping */
+    if ( mfn_valid(mfn_x(mfn))
+         && (gfn + (1UL << order) - 1 > d->arch.p2m.max_mapped_pfn) )
+        d->arch.p2m.max_mapped_pfn = gfn + (1UL << order) - 1;
+
+    /* Success */
+    rv = 1;
+
+ out:
+    unmap_domain_page(table);
+    ept_sync_domain(d);
+
+    /* support pci pass-through */
+    if ( iommu_enabled && is_hvm_domain(d) 
+            && need_modify_vtd_table)
+    {
+        if ( p2mt == p2m_ram_rw )
+        {
+            if ( order == EPT_TABLE_ORDER )
+            {
+                for ( i = 0; i < ( 1 << order ); i++ )
+                    iommu_map_page(d, gfn-offset+i, mfn_x(mfn)-offset+i);
+            }
+            else if ( !order )
+                iommu_map_page(d, gfn, mfn_x(mfn));
+        }
+        else
+        {
+            if ( order == EPT_TABLE_ORDER )
+            {
+                for ( i = 0; i < ( 1 << order ); i++ )
+                    iommu_unmap_page(d, gfn-offset+i);
+            }
+            else if ( !order )
+                iommu_unmap_page(d, gfn);
+        }
+    }
+
+    return rv;
+}
+
+/* Read ept p2m entries */
+mfn_t ept_get_entry(struct domain *d, unsigned long gfn, p2m_type_t *t)
+{
+    ept_entry_t *table =
+        map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
+    unsigned long gfn_remainder = gfn;
+    ept_entry_t ept_entry;
+    u32 index;
+    int i, ret=0;
+    mfn_t mfn = _mfn(INVALID_MFN);
+
+    *t = p2m_mmio_dm;
+
+    /* This pfn is higher than the highest the p2m map currently holds */
+    if ( gfn > d->arch.p2m.max_mapped_pfn )
+        goto out;
+
+    /* should check if gfn obeys GAW here */
+
+    for ( i = EPT_DEFAULT_GAW; i > 0; i-- )
+    {
+        ret = ept_next_level(d, 1, &table, &gfn_remainder,
+                                        i * EPT_TABLE_ORDER, 0);
+        if ( !ret )
+            goto out;
+        else if ( ret == GUEST_TABLE_SUPER_PAGE )
+            break;
+    }
+
+    index = gfn_remainder >> ( i * EPT_TABLE_ORDER);
+    ept_entry = table[index];
+
+    if ( ept_entry.avail1 != p2m_invalid )
+    {
+        *t = ept_entry.avail1;
+        mfn = _mfn(ept_entry.mfn);
+        if ( i )
+        {
+            /* We may have hit a superpage; add the 4k offset within
+             * it so the lookup behaves like a normal p2m table.
+             */
+            unsigned long split_mfn = 
+              mfn_x(mfn) + (gfn_remainder & ( ((1 << (i*EPT_TABLE_ORDER)) - 1 )));
+            mfn = _mfn(split_mfn);
+        }
+    }
+
+ out:
+    unmap_domain_page(table);
+    return mfn;
+}
+
+static mfn_t ept_get_entry_fast(unsigned long gfn, p2m_type_t *t)
+{
+    return ept_get_entry(current->domain, gfn, t);
+}
+
+/* Walk the whole p2m table, changing any entries of the old type
+ * to the new type.  This is used in hardware-assisted paging to
+ * quickly enable or disable log-dirty tracking */
+
+static void ept_change_entry_type_global(struct domain *d,
+                                         p2m_type_t ot, p2m_type_t nt)
+{
+    ept_entry_t *l4e, *l3e, *l2e, *l1e;
+    int i4, i3, i2, i1;
+
+    if ( pagetable_get_pfn(d->arch.phys_table) == 0 )
+        return;
+
+    BUG_ON(EPT_DEFAULT_GAW != 3);
+
+    l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
+    for (i4 = 0; i4 < EPT_PAGETABLE_ENTRIES; i4++ )
+    {
+        if ( !l4e[i4].epte )
+            continue;
+        if ( !l4e[i4].sp_avail )
+        {
+            l3e = map_domain_page(l4e[i4].mfn);
+            for ( i3 = 0; i3 < EPT_PAGETABLE_ENTRIES; i3++ )
+            {
+                if ( !l3e[i3].epte )
+                    continue;
+                if ( !l3e[i3].sp_avail )
+                {
+                    l2e = map_domain_page(l3e[i3].mfn);
+                    for ( i2 = 0; i2 < EPT_PAGETABLE_ENTRIES; i2++ )
+                    {
+                        if ( !l2e[i2].epte )
+                            continue;
+                        if ( !l2e[i2].sp_avail )
+                        {
+                            l1e = map_domain_page(l2e[i2].mfn);
+                            for ( i1  = 0; i1 < EPT_PAGETABLE_ENTRIES; i1++ )
+                            {
+                                if ( !l1e[i1].epte )
+                                    continue;
+                                if ( l1e[i1].avail1 != ot )
+                                    continue;
+                                l1e[i1].avail1 = nt;
+                                ept_p2m_type_to_flags(l1e+i1, nt);
+                            }
+                            unmap_domain_page(l1e);
+                        }
+                        else
+                        {
+                            if ( l2e[i2].avail1 != ot )
+                                continue;
+                            l2e[i2].avail1 = nt;
+                            ept_p2m_type_to_flags(l2e+i2, nt);
+                        }
+                    }
+                    unmap_domain_page(l2e);
+                }
+                else
+                {
+                    if ( l3e[i3].avail1 != ot )
+                        continue;
+                    l3e[i3].avail1 = nt;
+                    ept_p2m_type_to_flags(l3e+i3, nt);
+                }
+            }
+            unmap_domain_page(l3e);
+        }
+        else
+        {
+            if ( l4e[i4].avail1 != ot )
+                continue;
+            l4e[i4].avail1 = nt;
+            ept_p2m_type_to_flags(l4e+i4, nt);
+        }
+    }
+    unmap_domain_page(l4e);
+
+    ept_sync_domain(d);
+}
+
+static void __ept_change_entry_type_global(struct domain *d,
+                                         u32 l1e_flags)
+{
+    p2m_type_t nt, ot;
+
+    if ( l1e_flags == (__PAGE_HYPERVISOR|_PAGE_USER) )
+    {
+        nt = p2m_ram_rw;
+        ot = p2m_ram_logdirty;
+    }
+    else if ( l1e_flags == (_PAGE_PRESENT|_PAGE_USER) )
+    {
+        nt = p2m_ram_logdirty;
+        ot = p2m_ram_rw;
+    }
+    else
+    {
+        nt = ot = p2m_ram_rw;
+        BUG();
+    }
+
+    ept_change_entry_type_global(d, ot, nt);
+}
+
+void ept_p2m_init(struct domain *d)
+{
+    d->arch.p2m.set_entry = compat_ept_set_entry;
+    d->arch.p2m.get_entry = compat_ept_get_entry;
+    d->arch.p2m.get_entry_fast = compat_ept_get_entry_fast;
+    d->arch.p2m.change_entry_type_global = __ept_change_entry_type_global;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -Naurp xen/arch/x86/mm/paging.c xen-redhat/arch/x86/mm/paging.c
--- xen/arch/x86/mm/paging.c
+++ xen-redhat/arch/x86/mm/paging.c
@@ -27,10 +27,6 @@
 #include <asm/hap.h>
 #include <asm/guest_access.h>
 
-/* Xen command-line option to enable hardware-assisted paging */
-int opt_hap_enabled;
-boolean_param("hap", opt_hap_enabled);
-
 /* Printouts */
 #define PAGING_PRINTK(_f, _a...)                                     \
     debugtrace_printk("pg: %s(): " _f, __func__, ##_a)
@@ -362,14 +358,14 @@ void paging_domain_init(struct domain *d
     shadow_domain_init(d);
 
     /* ... but we will use hardware assistance if it's available. */
-    if ( opt_hap_enabled && is_hvm_domain(d) )
+    if ( hap_enabled(d) )
         hap_domain_init(d);
 }
 
 /* vcpu paging struct initialization goes here */
 void paging_vcpu_init(struct vcpu *v)
 {
-    if ( opt_hap_enabled && is_hvm_vcpu(v) )
+    if ( hap_enabled(v->domain) )
         hap_vcpu_init(v);
     else
         shadow_vcpu_init(v);
@@ -429,7 +425,7 @@ int paging_domctl(struct domain *d, xen_
     }
 	
     /* Here, dispatch domctl to the appropriate paging code */
-    if ( opt_hap_enabled && is_hvm_domain(d) )
+    if ( hap_enabled(d) )
 	return hap_domctl(d, sc, u_domctl);
     else
 	return shadow_domctl(d, sc, u_domctl);
@@ -438,7 +434,7 @@ int paging_domctl(struct domain *d, xen_
 /* Call when destroying a domain */
 void paging_teardown(struct domain *d)
 {
-    if ( opt_hap_enabled && is_hvm_domain(d) )
+    if ( hap_enabled(d) )
         hap_teardown(d);
     else
         shadow_teardown(d);
@@ -450,7 +446,7 @@ void paging_teardown(struct domain *d)
 /* Call once all of the references to the domain have gone away */
 void paging_final_teardown(struct domain *d)
 {
-    if ( opt_hap_enabled && is_hvm_domain(d) )
+    if ( hap_enabled(d) )
         hap_final_teardown(d);
     else
         shadow_final_teardown(d);
@@ -460,7 +456,7 @@ void paging_final_teardown(struct domain
  * creation. */
 int paging_enable(struct domain *d, u32 mode)
 {
-    if ( opt_hap_enabled && is_hvm_domain(d) )
+    if ( hap_enabled(d) )
         return hap_enable(d, mode | PG_HAP_enable);
     else
         return shadow_enable(d, mode | PG_SH_enable);
diff -Naurp xen/arch/x86/mm/shadow/common.c xen-redhat/arch/x86/mm/shadow/common.c
--- xen/arch/x86/mm/shadow/common.c
+++ xen-redhat/arch/x86/mm/shadow/common.c
@@ -101,7 +101,7 @@ int _shadow_mode_refcounts(struct domain
 /* x86 emulator support for the shadow code
  */
 
-struct segment_register *hvm_get_seg_reg(
+static struct segment_register *hvm_get_seg_reg(
     enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt)
 {
     struct segment_register *seg_reg = &sh_ctxt->seg_reg[seg];
@@ -110,10 +110,6 @@ struct segment_register *hvm_get_seg_reg
     return seg_reg;
 }
 
-enum hvm_access_type {
-    hvm_access_insn_fetch, hvm_access_read, hvm_access_write
-};
-
 static int hvm_translate_linear_addr(
     enum x86_segment seg,
     unsigned long offset,
@@ -123,76 +119,18 @@ static int hvm_translate_linear_addr(
     unsigned long *paddr)
 {
     struct segment_register *reg = hvm_get_seg_reg(seg, sh_ctxt);
-    unsigned long limit, addr = offset;
-    uint32_t last_byte;
+    int okay;
 
-    if ( sh_ctxt->ctxt.addr_size != 64 )
-    {
-        /*
-         * COMPATIBILITY MODE: Apply segment checks and add base.
-         */
+    okay = hvm_virtual_to_linear_addr(
+        seg, reg, offset, bytes, access_type, sh_ctxt->ctxt.addr_size, paddr);
 
-        switch ( access_type )
-        {
-        case hvm_access_read:
-            if ( (reg->attr.fields.type & 0xa) == 0x8 )
-                goto gpf; /* execute-only code segment */
-            break;
-        case hvm_access_write:
-            if ( (reg->attr.fields.type & 0xa) != 0x2 )
-                goto gpf; /* not a writable data segment */
-            break;
-        default:
-            break;
-        }
-
-        /* Calculate the segment limit, including granularity flag. */
-        limit = reg->limit;
-        if ( reg->attr.fields.g )
-            limit = (limit << 12) | 0xfff;
-
-        last_byte = offset + bytes - 1;
-
-        /* Is this a grows-down data segment? Special limit check if so. */
-        if ( (reg->attr.fields.type & 0xc) == 0x4 )
-        {
-            /* Is upper limit 0xFFFF or 0xFFFFFFFF? */
-            if ( !reg->attr.fields.db )
-                last_byte = (uint16_t)last_byte;
-
-            /* Check first byte and last byte against respective bounds. */
-            if ( (offset <= limit) || (last_byte < offset) )
-                goto gpf;
-        }
-        else if ( (last_byte > limit) || (last_byte < offset) )
-            goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */
-
-        /*
-         * Hardware truncates to 32 bits in compatibility mode.
-         * It does not truncate to 16 bits in 16-bit address-size mode.
-         */
-        addr = (uint32_t)(addr + reg->base);
-    }
-    else
+    if ( !okay )
     {
-        /*
-         * LONG MODE: FS and GS add segment base. Addresses must be canonical.
-         */
-
-        if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) )
-            addr += reg->base;
-
-        if ( !is_canonical_address(addr) )
-            goto gpf;
+        hvm_inject_exception(TRAP_gp_fault, 0, 0);
+        return X86EMUL_EXCEPTION;
     }
 
-    *paddr = addr;
-    return 0;    
-
- gpf:
-    /* Inject #GP(0). */
-    hvm_inject_exception(TRAP_gp_fault, 0, 0);
-    return X86EMUL_EXCEPTION;
+    return 0;
 }
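
The open-coded checks deleted above (scaling the limit by the granularity bit, the special bounds test for grows-down data segments, canonical-address checks in long mode) are now performed by hvm_virtual_to_linear_addr(). The sketch below restates just the compatibility-mode limit check in isolation; the struct and helper names are illustrative, not the Xen interface.

/* Simplified restatement of the segment-limit check the removed code did
 * (illustrative only; not the hvm_virtual_to_linear_addr() interface). */
#include <stdint.h>
#include <stdio.h>

struct seg {
    uint32_t limit;       /* raw limit field from the descriptor */
    unsigned int g:1;     /* granularity: limit in 4K units when set */
    unsigned int db:1;    /* default size: 32-bit when set */
    unsigned int type:4;  /* descriptor type bits */
};

/* Return 1 if [offset, offset+bytes) fits the segment, 0 to raise #GP. */
static int limit_check(const struct seg *s, uint32_t offset, uint32_t bytes)
{
    uint32_t limit = s->g ? (s->limit << 12) | 0xfff : s->limit;
    uint32_t last_byte = offset + bytes - 1;

    if ( (s->type & 0xc) == 0x4 )         /* grows-down data segment */
    {
        if ( !s->db )
            last_byte = (uint16_t)last_byte;
        return !(offset <= limit || last_byte < offset);
    }
    return !(last_byte > limit || last_byte < offset);
}

int main(void)
{
    struct seg ds = { .limit = 0xfffff, .g = 1, .db = 1, .type = 0x3 };
    printf("4-byte access at 0x1000 ok? %d\n", limit_check(&ds, 0x1000, 4));
    return 0;
}
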
 
 static int
@@ -771,14 +709,29 @@ shadow_order(unsigned int shadow_type) 
 }
 
 
-/* Do we have a free chunk of at least this order? */
-static inline int chunk_is_available(struct domain *d, int order)
+static inline unsigned int
+shadow_max_order(struct domain *d)
 {
-    int i;
-    
-    for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
-        if ( !list_empty(&d->arch.paging.shadow.freelists[i]) )
-            return 1;
+    return is_hvm_domain(d) ? SHADOW_MAX_ORDER : 0;
+}
+
+/* Do we have a total of count pages of the requested order free? */
+static inline int space_is_available(
+    struct domain *d,
+    unsigned int order,
+    unsigned int count)
+{
+    for ( ; order <= shadow_max_order(d); ++order )
+    {
+        unsigned int n = count;
+        const struct list_head *p;
+
+        list_for_each ( p, &d->arch.paging.shadow.freelists[order] )
+            if ( --n == 0 )
+                return 1;
+        count = (count + 1) >> 1;
+    }
+
     return 0;
 }
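
space_is_available() walks the buddy freelists upward: a request for count chunks of a given order can also be met by ceil(count/2) free chunks of the next order, since each of those splits into two. A small standalone sketch of that demand schedule (the SHADOW_MAX_ORDER value is assumed for the example):

/* Sketch of how the required chunk count shrinks per buddy order in
 * space_is_available(): one free order-(n+1) chunk covers two order-n
 * requests. Illustrative only; SHADOW_MAX_ORDER value is assumed. */
#include <stdio.h>

#define SHADOW_MAX_ORDER 2U   /* assumed for the example */

int main(void)
{
    unsigned int order = 0, count = 5;   /* want 5 single pages */

    for ( ; order <= SHADOW_MAX_ORDER; ++order )
    {
        printf("order %u: need %u free chunk(s)\n", order, count);
        count = (count + 1) >> 1;        /* same rounding as the patch */
    }
    /* prints: order 0: 5, order 1: 3, order 2: 2 */
    return 0;
}
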
 
@@ -814,12 +767,12 @@ static void shadow_unhook_mappings(struc
 }
 
 
-/* Make sure there is at least one chunk of the required order available
- * in the shadow page pool. This must be called before any calls to
- * shadow_alloc().  Since this will free existing shadows to make room,
- * it must be called early enough to avoid freeing shadows that the
- * caller is currently working on. */
-void shadow_prealloc(struct domain *d, unsigned int order)
+/* Make sure there are at least count order-sized pages
+ * available in the shadow page pool. */
+static void _shadow_prealloc(
+    struct domain *d,
+    unsigned int order,
+    unsigned int count)
 {
     /* Need a vcpu for calling unpins; for now, since we don't have
      * per-vcpu shadows, any will do */
@@ -830,7 +783,8 @@ void shadow_prealloc(struct domain *d, u
     mfn_t smfn;
     int i;
 
-    if ( chunk_is_available(d, order) ) return; 
+    ASSERT(order <= shadow_max_order(d));
+    if ( space_is_available(d, order, count) ) return;
     
     v = current;
     if ( v->domain != d )
@@ -847,8 +801,8 @@ void shadow_prealloc(struct domain *d, u
         /* Unpin this top-level shadow */
         sh_unpin(v, smfn);
 
-        /* See if that freed up a chunk of appropriate size */
-        if ( chunk_is_available(d, order) ) return;
+        /* See if that freed up enough space */
+        if ( space_is_available(d, order, count) ) return;
     }
 
     /* Stage two: all shadow pages are in use in hierarchies that are
@@ -865,8 +819,8 @@ void shadow_prealloc(struct domain *d, u
                                pagetable_get_mfn(v2->arch.shadow_table[i]));
                 cpus_or(flushmask, v2->vcpu_dirty_cpumask, flushmask);
 
-                /* See if that freed up a chunk of appropriate size */
-                if ( chunk_is_available(d, order) ) 
+                /* See if that freed up enough space */
+                if ( space_is_available(d, order, count) )
                 {
                     flush_tlb_mask(flushmask);
                     return;
@@ -876,15 +830,26 @@ void shadow_prealloc(struct domain *d, u
     
     /* Nothing more we can do: all remaining shadows are of pages that
      * hold Xen mappings for some vcpu.  This can never happen. */
-    SHADOW_ERROR("Can't pre-allocate %i shadow pages!\n"
+    SHADOW_ERROR("Can't pre-allocate %u order-%u shadow pages!\n"
                  "  shadow pages total = %u, free = %u, p2m=%u\n",
-                 1 << order,
+                 count, order,
                  d->arch.paging.shadow.total_pages,
                  d->arch.paging.shadow.free_pages,
                  d->arch.paging.shadow.p2m_pages);
     BUG();
 }
 
+/* Make sure there are at least count pages of the order implied by
+ * type available in the shadow page pool.
+ * This must be called before any calls to shadow_alloc().  Since this
+ * will free existing shadows to make room, it must be called early enough
+ * to avoid freeing shadows that the caller is currently working on. */
+void shadow_prealloc(struct domain *d, u32 type, unsigned int count)
+{
+    ASSERT(type != SH_type_p2m_table);
+    return _shadow_prealloc(d, shadow_order(type), count);
+}
+
 /* Deliberately free all the memory we can: this will tear down all of
  * this domain's shadows */
 static void shadow_blow_tables(struct domain *d) 
@@ -961,7 +926,9 @@ mfn_t shadow_alloc(struct domain *d,  
     int i;
 
     ASSERT(shadow_locked_by_me(d));
-    ASSERT(order <= SHADOW_MAX_ORDER);
+    if ( shadow_type == SH_type_p2m_table && order > shadow_max_order(d) )
+        order = shadow_max_order(d);
+    ASSERT(order <= shadow_max_order(d));
     ASSERT(shadow_type != SH_type_none);
     perfc_incr(shadow_alloc);
 
@@ -1062,7 +1029,7 @@ void shadow_free(struct domain *d, mfn_t
     }
 
     /* Merge chunks as far as possible. */
-    while ( order < SHADOW_MAX_ORDER )
+    for ( ; order < shadow_max_order(d); ++order )
     {
         mask = 1 << order;
         if ( (mfn_x(shadow_page_to_mfn(sp)) & mask) ) {
@@ -1077,7 +1044,6 @@ void shadow_free(struct domain *d, mfn_t
                 break;
             list_del(&(sp+mask)->list);
         }
-        order++;
     }
 
     sp->order = order;
@@ -1099,16 +1065,18 @@ sh_alloc_p2m_pages(struct domain *d)
 {
     struct page_info *pg;
     u32 i;
+    unsigned int order = shadow_max_order(d);
+
     ASSERT(shadow_locked_by_me(d));
     
     if ( d->arch.paging.shadow.total_pages 
-         < (shadow_min_acceptable_pages(d) + (1<<SHADOW_MAX_ORDER)) )
+         < (shadow_min_acceptable_pages(d) + (1 << order)) )
         return 0; /* Not enough shadow memory: need to increase it first */
     
     pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
-    d->arch.paging.shadow.p2m_pages += (1<<SHADOW_MAX_ORDER);
-    d->arch.paging.shadow.total_pages -= (1<<SHADOW_MAX_ORDER);
-    for (i = 0; i < (1<<SHADOW_MAX_ORDER); i++)
+    d->arch.paging.shadow.p2m_pages += (1 << order);
+    d->arch.paging.shadow.total_pages -= (1 << order);
+    for (i = 0; i < (1U << order); i++)
     {
         /* Unlike shadow pages, mark p2m pages as owned by the domain.
          * Marking the domain as the owner would normally allow the guest to
@@ -1228,7 +1196,7 @@ static unsigned int sh_set_allocation(st
 {
     struct shadow_page_info *sp;
     unsigned int lower_bound;
-    int j;
+    unsigned int j, order = shadow_max_order(d);
 
     ASSERT(shadow_locked_by_me(d));
     
@@ -1249,15 +1217,15 @@ static unsigned int sh_set_allocation(st
         {
             /* Need to allocate more memory from domheap */
             sp = (struct shadow_page_info *)
-                alloc_domheap_pages(NULL, SHADOW_MAX_ORDER, 0); 
+                alloc_domheap_pages(NULL, order, 0);
             if ( sp == NULL ) 
             { 
                 SHADOW_PRINTK("failed to allocate shadow pages.\n");
                 return -ENOMEM;
             }
-            d->arch.paging.shadow.free_pages += 1<<SHADOW_MAX_ORDER;
-            d->arch.paging.shadow.total_pages += 1<<SHADOW_MAX_ORDER;
-            for ( j = 0; j < 1<<SHADOW_MAX_ORDER; j++ ) 
+            d->arch.paging.shadow.free_pages += 1 << order;
+            d->arch.paging.shadow.total_pages += 1 << order;
+            for ( j = 0; j < 1U << order; j++ )
             {
                 sp[j].type = 0;  
                 sp[j].pinned = 0;
@@ -1265,21 +1233,20 @@ static unsigned int sh_set_allocation(st
                 sp[j].mbz = 0;
                 sp[j].tlbflush_timestamp = 0; /* Not in any TLB */
             }
-            sp->order = SHADOW_MAX_ORDER;
-            list_add_tail(&sp->list, 
-                          &d->arch.paging.shadow.freelists[SHADOW_MAX_ORDER]);
+            sp->order = order;
+            list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[order]);
         } 
         else if ( d->arch.paging.shadow.total_pages > pages ) 
         {
             /* Need to return memory to domheap */
-            shadow_prealloc(d, SHADOW_MAX_ORDER);
-            ASSERT(!list_empty(&d->arch.paging.shadow.freelists[SHADOW_MAX_ORDER]));
-            sp = list_entry(d->arch.paging.shadow.freelists[SHADOW_MAX_ORDER].next, 
+            _shadow_prealloc(d, order, 1);
+            ASSERT(!list_empty(&d->arch.paging.shadow.freelists[order]));
+            sp = list_entry(d->arch.paging.shadow.freelists[order].next,
                             struct shadow_page_info, list);
             list_del(&sp->list);
-            d->arch.paging.shadow.free_pages -= 1<<SHADOW_MAX_ORDER;
-            d->arch.paging.shadow.total_pages -= 1<<SHADOW_MAX_ORDER;
-            free_domheap_pages((struct page_info *)sp, SHADOW_MAX_ORDER);
+            d->arch.paging.shadow.free_pages -= 1 << order;
+            d->arch.paging.shadow.total_pages -= 1 << order;
+            free_domheap_pages((struct page_info *)sp, order);
         }
 
         /* Check to see if we need to yield and try again */
diff -Naurp xen/arch/x86/mm/shadow/multi.c xen-redhat/arch/x86/mm/shadow/multi.c
--- xen/arch/x86/mm/shadow/multi.c
+++ xen-redhat/arch/x86/mm/shadow/multi.c
@@ -35,6 +35,7 @@
 #include <asm/hvm/hvm.h>
 #include "private.h"
 #include "types.h"
+#include <xen/iocap.h>
 
 /* THINGS TO DO LATER:
  * 
@@ -654,7 +655,8 @@ _sh_propagate(struct vcpu *v, 
         goto done;
     }
 
-    if ( level == 1 && mmio )
+    if ( level == 1 && mmio &&
+         !iomem_access_permitted(d, mfn_x(target_mfn), mfn_x(target_mfn)) )
     {
         /* Guest l1e maps MMIO space */
         *sp = sh_l1e_mmio(guest_l1e_get_gfn(*gp), gflags);
@@ -667,7 +669,8 @@ _sh_propagate(struct vcpu *v, 
     // case of a prefetch, an invalid mfn means that we can not usefully
     // shadow anything, and so we return early.
     //
-    if ( !mfn_valid(target_mfn) )
+    if ( !mfn_valid(target_mfn) &&
+         !iomem_access_permitted(d, mfn_x(target_mfn), mfn_x(target_mfn)) )
     {
         ASSERT((ft == ft_prefetch));
         *sp = shadow_l1e_empty();
@@ -750,6 +753,10 @@ _sh_propagate(struct vcpu *v, 
         sflags |= _PAGE_USER;
     }
 
+    /* MMIO addresses should never be cached */
+    if ( iomem_access_permitted(d, mfn_x(target_mfn), mfn_x(target_mfn)) )
+        sflags |= _PAGE_PCD;
+
     *sp = shadow_l1e_from_mfn(target_mfn, sflags);
  done:
     SHADOW_DEBUG(PROPAGATE,
@@ -1661,7 +1668,7 @@ sh_make_monitor_table(struct vcpu *v)
     ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
     
     /* Guarantee we can get the memory we need */
-    shadow_prealloc(d, SHADOW_MAX_ORDER);
+    shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS - 1);
 
 #if CONFIG_PAGING_LEVELS == 4    
     {
@@ -2815,10 +2822,13 @@ static int sh_page_fault(struct vcpu *v,
     }
 
     /* Make sure there is enough free shadow memory to build a chain of
-     * shadow tables: one SHADOW_MAX_ORDER chunk will always be enough
-     * to allocate all we need.  (We never allocate a top-level shadow
-     * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */
-    shadow_prealloc(d, SHADOW_MAX_ORDER);
+     * shadow tables. (We never allocate a top-level shadow on this path,
+     * only a 32b l1, pae l1, or 64b l3+2+1. Note that while
+     * SH_type_l1_shadow isn't correct in the latter case, all page
+     * tables are the same size there.) */
+    shadow_prealloc(d,
+                    SH_type_l1_shadow,
+                    GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
 
     /* Acquire the shadow.  This must happen before we figure out the rights 
      * for the shadow entry, since we might promote a page here. */
@@ -2905,7 +2915,7 @@ static int sh_page_fault(struct vcpu *v,
          * stack is currently considered to be a page table, so we should
          * unshadow the faulting page before exiting.
          */
-        if ( unlikely(hvm_event_injection_faulted(v)) )
+        if ( unlikely(hvm_event_pending(v)) )
         {
             gdprintk(XENLOG_DEBUG, "write to pagetable during event "
                      "injection: cr2=%#lx, mfn=%#lx\n", 
@@ -3439,7 +3449,7 @@ sh_set_toplevel_shadow(struct vcpu *v, 
     if ( !mfn_valid(smfn) )
     {
         /* Make sure there's enough free shadow memory. */
-        shadow_prealloc(d, SHADOW_MAX_ORDER); 
+        shadow_prealloc(d, root_type, 1);
         /* Shadow the page. */
         smfn = sh_make_shadow(v, gmfn, root_type);
     }
@@ -4012,7 +4022,8 @@ static inline void * emulate_map_dest(st
     if ( !(flags & _PAGE_RW) ) 
         goto page_fault;
 
-    if ( mfn_valid(mfn) )
+    if ( mfn_valid(mfn) &&
+         (mfn_x(mfn) != v->domain->arch.hvm_domain.vmx_apic_access_mfn) )
     {
         *mfnp = mfn;
         v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn);
diff -Naurp xen/arch/x86/mm/shadow/private.h xen-redhat/arch/x86/mm/shadow/private.h
--- xen/arch/x86/mm/shadow/private.h
+++ xen-redhat/arch/x86/mm/shadow/private.h
@@ -243,17 +243,22 @@ struct shadow_page_info
         /* For non-pinnable shadows, a higher entry that points at us */
         paddr_t up;
     };
+#if NR_CPUS > 64
+    /* Need to add some padding to match struct page_info size,
+    * if cpumask_t is larger than a long
+    */
+    u8 padding[sizeof(cpumask_t)-sizeof(long)];
+#endif
 };
 
-/* The structure above *must* be the same size as a struct page_info
+/* The structure above *must* be no larger than a struct page_info
  * from mm.h, since we'll be using the same space in the frametable. 
  * Also, the mbz field must line up with the owner field of normal 
  * pages, so they look properly like anonymous/xen pages. */
 static inline void shadow_check_page_struct_offsets(void) {
-    BUILD_BUG_ON(sizeof (struct shadow_page_info) 
-                 != sizeof (struct page_info));
-    BUILD_BUG_ON(offsetof(struct shadow_page_info, mbz) 
-                 != offsetof(struct page_info, u.inuse._domain));
+    BUILD_BUG_ON(sizeof (struct shadow_page_info) > sizeof (struct page_info));
+    BUILD_BUG_ON(offsetof(struct shadow_page_info, mbz) !=
+                 offsetof(struct page_info, u.inuse._domain));
 };
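
The relaxed check above only requires struct shadow_page_info to be no larger than struct page_info, with explicit padding once cpumask_t outgrows a long. For comparison, the same compile-time guarantee can be expressed with C11 _Static_assert; the structs below are stand-ins, not the Xen definitions:

/* Sketch of the same compile-time size guarantee using C11 _Static_assert
 * (illustrative; the structs here are stand-ins, not the Xen definitions). */
#include <stdio.h>

struct fake_page_info        { unsigned long a, b, c, d; };
struct fake_shadow_page_info { unsigned long a, b, c; };

_Static_assert(sizeof(struct fake_shadow_page_info) <=
               sizeof(struct fake_page_info),
               "shadow page info must fit in a page_info slot");

int main(void)
{
    printf("sizes: %zu <= %zu\n",
           sizeof(struct fake_shadow_page_info),
           sizeof(struct fake_page_info));
    return 0;
}
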
 
 /* Shadow type codes */
@@ -354,7 +359,7 @@ void shadow_promote(struct vcpu *v, mfn_
 void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type);
 
 /* Shadow page allocation functions */
-void  shadow_prealloc(struct domain *d, unsigned int order);
+void  shadow_prealloc(struct domain *d, u32 shadow_type, unsigned int count);
 mfn_t shadow_alloc(struct domain *d, 
                     u32 shadow_type,
                     unsigned long backpointer);
diff -Naurp xen/arch/x86/mm.c xen-redhat/arch/x86/mm.c
--- xen/arch/x86/mm.c
+++ xen-redhat/arch/x86/mm.c
@@ -140,7 +140,7 @@ static DEFINE_PER_CPU(struct percpu_mm_i
 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
 
 /* Private domain structs for DOMID_XEN and DOMID_IO. */
-static struct domain *dom_xen, *dom_io;
+struct domain *dom_xen, *dom_io;
 
 /* Frame table and its size in pages. */
 struct page_info *frame_table;
@@ -465,11 +465,11 @@ static int alloc_segdesc_page(struct pag
             goto fail;
 
     unmap_domain_page(descs);
-    return 1;
+    return 0;
 
  fail:
     unmap_domain_page(descs);
-    return 0;
+    return -EINVAL;
 }
 
 
@@ -523,20 +523,25 @@ static int get_page_from_pagenr(unsigned
 
 static int get_page_and_type_from_pagenr(unsigned long page_nr, 
                                          unsigned long type,
-                                         struct domain *d)
+                                         struct domain *d,
+                                         int partial,
+                                         int preemptible)
 {
     struct page_info *page = mfn_to_page(page_nr);
+    int rc;
 
-    if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
-        return 0;
+    if ( likely(partial >= 0) &&
+         unlikely(!get_page_from_pagenr(page_nr, d)) )
+        return -EINVAL;
 
-    if ( unlikely(!get_page_type(page, type)) )
-    {
+    rc = (preemptible ?
+          get_page_type_preemptible(page, type) :
+          (get_page_type(page, type) ? 0 : -EINVAL));
+
+    if ( unlikely(rc) && partial >= 0 )
         put_page(page);
-        return 0;
-    }
 
-    return 1;
+    return rc;
 }
 
 /*
@@ -667,12 +672,13 @@ get_page_from_l2e(
     if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
     {
         MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
-        return 0;
+        return -EINVAL;
     }
 
-    rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d);
-    if ( unlikely(!rc) )
-        rc = get_l2_linear_pagetable(l2e, pfn, d);
+    rc = get_page_and_type_from_pagenr(
+	l2e_get_pfn(l2e), PGT_l1_page_table, d, 0, 0);
+    if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
+        rc = 0;
 
     return rc;
 }
@@ -682,7 +688,7 @@ get_page_from_l2e(
 define_get_linear_pagetable(l3);
 static int
 get_page_from_l3e(
-    l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
+    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int preemptible)
 {
     int rc;
 
@@ -692,12 +698,13 @@ get_page_from_l3e(
     if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
     {
         MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
-        return 0;
+        return -EINVAL;
     }
 
-    rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d);
-    if ( unlikely(!rc) )
-        rc = get_l3_linear_pagetable(l3e, pfn, d);
+    rc = get_page_and_type_from_pagenr(
+        l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible);
+    if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
+        rc = 0;
 
     return rc;
 }
@@ -707,7 +714,7 @@ get_page_from_l3e(
 define_get_linear_pagetable(l4);
 static int
 get_page_from_l4e(
-    l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
+    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int preemptible)
 {
     int rc;
 
@@ -717,12 +724,13 @@ get_page_from_l4e(
     if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
     {
         MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
-        return 0;
+        return -EINVAL;
     }
 
-    rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d);
-    if ( unlikely(!rc) )
-        rc = get_l4_linear_pagetable(l4e, pfn, d);
+    rc = get_page_and_type_from_pagenr(
+        l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible);
+    if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
+        rc = 0;
 
     return rc;
 }
@@ -857,29 +865,47 @@ void put_page_from_l1e(l1_pgentry_t l1e,
  * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
  * Note also that this automatically deals correctly with linear p.t.'s.
  */
-static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
+static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
 {
-    if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) && 
-         (l2e_get_pfn(l2e) != pfn) )
+    if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
+         (l2e_get_pfn(l2e) != pfn) )
+    {
         put_page_and_type(l2e_get_page(l2e));
+        return 0;
+    }
+    return 1;
 }
 
 
 #if CONFIG_PAGING_LEVELS >= 3
-static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
+static int __put_page_type(struct page_info *, int preemptible);
+
+static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
+                             int partial, int preemptible)
 {
-    if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && 
-         (l3e_get_pfn(l3e) != pfn) )
-        put_page_and_type(l3e_get_page(l3e));
+    if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
+         (l3e_get_pfn(l3e) != pfn) )
+    {
+        if ( unlikely(partial > 0) )
+            return __put_page_type(l3e_get_page(l3e), preemptible);
+        return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
+    }
+    return 1;
 }
 #endif
 
 #if CONFIG_PAGING_LEVELS >= 4
-static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
+static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
+                              int partial, int preemptible)
 {
     if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && 
          (l4e_get_pfn(l4e) != pfn) )
-        put_page_and_type(l4e_get_page(l4e));
+    {
+        if ( unlikely(partial > 0) )
+            return __put_page_type(l4e_get_page(l4e), preemptible);
+        return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
+    }
+    return 1;
 }
 #endif
 
@@ -888,7 +914,7 @@ static int alloc_l1_table(struct page_in
     struct domain *d = page_get_owner(page);
     unsigned long  pfn = page_to_mfn(page);
     l1_pgentry_t  *pl1e;
-    int            i;
+    unsigned int   i;
 
     pl1e = map_domain_page(pfn);
 
@@ -902,7 +928,7 @@ static int alloc_l1_table(struct page_in
     }
 
     unmap_domain_page(pl1e);
-    return 1;
+    return 0;
 
  fail:
     MEM_LOG("Failure in alloc_l1_table: entry %d", i);
@@ -911,7 +937,7 @@ static int alloc_l1_table(struct page_in
             put_page_from_l1e(pl1e[i], d);
 
     unmap_domain_page(pl1e);
-    return 0;
+    return -EINVAL;
 }
 
 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
@@ -1043,61 +1069,57 @@ static void pae_flush_pgd(
 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
 #endif
 
-static int alloc_l2_table(struct page_info *page, unsigned long type)
+static int alloc_l2_table(struct page_info *page, unsigned long type,
+                          int preemptible)
 {
     struct domain *d = page_get_owner(page);
     unsigned long  pfn = page_to_mfn(page);
     l2_pgentry_t  *pl2e;
-    int            i;
+    unsigned int   i;
+    int            rc = 0;
 
     pl2e = map_domain_page(pfn);
 
-    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
+    for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
     {
-        if ( is_guest_l2_slot(d, type, i) &&
-             unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) )
-            goto fail;
-        
-        adjust_guest_l2e(pl2e[i], d);
-    }
+        if ( preemptible && i && hypercall_preempt_check() )
+        {
+            page->nr_validated_ptes = i;
+            rc = -EAGAIN;
+            break;
+        }
 
-#if CONFIG_PAGING_LEVELS == 2
-    /* Xen private mappings. */
-    memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
-           &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
-           L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
-    pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
-        l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
-    for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
-        pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
-            l2e_from_page(
-                virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
-                __PAGE_HYPERVISOR);
-#endif
+        if ( !is_guest_l2_slot(d, type, i) ||
+             (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
+            continue;
 
-    unmap_domain_page(pl2e);
-    return 1;
+        if ( rc < 0 )
+        {
+            MEM_LOG("Failure in alloc_l2_table: entry %d", i);
+            while ( i-- > 0 )
+                if ( is_guest_l2_slot(d, type, i) )
+                    put_page_from_l2e(pl2e[i], pfn);
+            break;
+        }
 
- fail:
-    MEM_LOG("Failure in alloc_l2_table: entry %d", i);
-    while ( i-- > 0 )
-        if ( is_guest_l2_slot(d, type, i) )
-            put_page_from_l2e(pl2e[i], pfn);
+        adjust_guest_l2e(pl2e[i], d);
+    }
 
     unmap_domain_page(pl2e);
-    return 0;
+    return rc > 0 ? 0 : rc;
 }
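
alloc_l2_table() is now preemptible: when hypercall_preempt_check() fires it records its progress in page->nr_validated_ptes, returns -EAGAIN, and a later invocation resumes from that index. A generic, self-contained sketch of this resumable-validation pattern follows, with a fake preemption predicate standing in for hypercall_preempt_check() and -11 standing in for -EAGAIN (names are illustrative, not the Xen API).

/* Generic sketch of the resumable-validation pattern used here (not the
 * Xen API): record progress on -EAGAIN, resume from it on the next call. */
#include <stdio.h>

#define NENTRIES 8

struct pt_page {
    unsigned int nr_validated_ptes;   /* resume point */
};

/* Fake preemption check: pretend we must yield after every 3 entries. */
static int preempt_pending(unsigned int done) { return done && (done % 3) == 0; }

static int validate(struct pt_page *pg)
{
    unsigned int i;

    for ( i = pg->nr_validated_ptes; i < NENTRIES; i++ )
    {
        if ( preempt_pending(i - pg->nr_validated_ptes) )
        {
            pg->nr_validated_ptes = i;   /* remember where to resume */
            return -11;                  /* stand-in for -EAGAIN */
        }
        printf("validated entry %u\n", i);
    }
    return 0;
}

int main(void)
{
    struct pt_page pg = { .nr_validated_ptes = 0 };
    while ( validate(&pg) != 0 )
        printf("preempted at entry %u, continuing...\n", pg.nr_validated_ptes);
    return 0;
}
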
 
 
 #if CONFIG_PAGING_LEVELS >= 3
-static int alloc_l3_table(struct page_info *page)
+static int alloc_l3_table(struct page_info *page, int preemptible)
 {
     struct domain *d = page_get_owner(page);
     unsigned long  pfn = page_to_mfn(page);
     l3_pgentry_t  *pl3e;
-    int            i;
+    unsigned int   i;
+    int            rc = 0, partial = page->partial_pte;
 
-#ifdef CONFIG_X86_PAE
+#if CONFIG_PAGING_LEVELS == 3
     /*
      * PAE pgdirs above 4GB are unacceptable if the guest does not understand
      * the weird 'extended cr3' format for dealing with high-order address
@@ -1108,7 +1130,7 @@ static int alloc_l3_table(struct page_in
          d->vcpu[0] && d->vcpu[0]->is_initialised )
     {
         MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
-        return 0;
+        return -EINVAL;
     }
 #endif
 
@@ -1124,60 +1146,103 @@ static int alloc_l3_table(struct page_in
     if ( is_pv_32on64_domain(d) )
         memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
 
-    for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
+    for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
+          i++, partial = 0 )
     {
-#if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
         if ( is_pv_32bit_domain(d) && (i == 3) )
         {
             if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
-                 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ||
-                 !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
-                                                PGT_l2_page_table |
-                                                PGT_pae_xen_l2,
-                                                d) )
-                goto fail;
+                 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
+                rc = -EINVAL;
+            else
+                rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
+                                                   PGT_l2_page_table |
+                                                   PGT_pae_xen_l2,
+                                                   d, partial, preemptible);
+        }
+        else if ( !is_guest_l3_slot(i) ||
+                  (rc = get_page_from_l3e(pl3e[i], pfn, d,
+                                          partial, preemptible)) > 0 )
+            continue;
+
+        if ( rc == -EAGAIN )
+        {
+            page->nr_validated_ptes = i;
+            page->partial_pte = partial ?: 1;
         }
-        else
-#endif
-        if ( is_guest_l3_slot(i) &&
-             unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
-            goto fail;
-        
+        else if ( rc == -EINTR && i )
+        {
+            page->nr_validated_ptes = i;
+            page->partial_pte = 0;
+            rc = -EAGAIN;
+        }
+        if ( rc < 0 )
+            break;
+
         adjust_guest_l3e(pl3e[i], d);
     }
 
-    if ( !create_pae_xen_mappings(d, pl3e) )
-        goto fail;
-
-    unmap_domain_page(pl3e);
-    return 1;
-
- fail:
-    MEM_LOG("Failure in alloc_l3_table: entry %d", i);
-    while ( i-- > 0 )
-        if ( is_guest_l3_slot(i) )
-            put_page_from_l3e(pl3e[i], pfn);
+    if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
+        rc = -EINVAL;
+    if ( rc < 0 && rc != -EAGAIN && rc != -EINTR )
+    {
+        MEM_LOG("Failure in alloc_l3_table: entry %d", i);
+        while ( i-- > 0 )
+        {
+            if ( !is_guest_l3_slot(i) )
+                continue;
+            unadjust_guest_l3e(pl3e[i], d);
+            put_page_from_l3e(pl3e[i], pfn, 0, 0);
+        }
+    }
 
     unmap_domain_page(pl3e);
-    return 0;
+    return rc > 0 ? 0 : rc;
 }
 #else
-#define alloc_l3_table(page) (0)
+#define alloc_l3_table(page, preemptible) (-EINVAL)
 #endif
 
 #if CONFIG_PAGING_LEVELS >= 4
-static int alloc_l4_table(struct page_info *page)
+static int alloc_l4_table(struct page_info *page, int preemptible)
 {
     struct domain *d = page_get_owner(page);
     unsigned long  pfn = page_to_mfn(page);
     l4_pgentry_t  *pl4e = page_to_virt(page);
-    int            i;
+    unsigned int   i;
+    int            rc = 0, partial = page->partial_pte;
 
-    for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
+    for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
+          i++, partial = 0 )
     {
-        if ( is_guest_l4_slot(d, i) &&
-             unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
-            goto fail;
+        if ( !is_guest_l4_slot(d, i) ||
+             (rc = get_page_from_l4e(pl4e[i], pfn, d,
+                                     partial, preemptible)) > 0 )
+            continue;
+
+        if ( rc == -EAGAIN )
+        {
+            page->nr_validated_ptes = i;
+            page->partial_pte = partial ?: 1;
+        }
+        else if ( rc == -EINTR )
+        {
+            if ( i )
+            {
+                page->nr_validated_ptes = i;
+                page->partial_pte = 0;
+                rc = -EAGAIN;
+            }
+        }
+        else if ( rc < 0 )
+        {
+            MEM_LOG("Failure in alloc_l4_table: entry %d", i);
+            while ( i-- > 0 )
+                if ( is_guest_l4_slot(d, i) )
+                    put_page_from_l4e(pl4e[i], pfn, 0, 0);
+        }
+        if ( rc < 0 )
+            return rc;
 
         adjust_guest_l4e(pl4e[i], d);
     }
@@ -1191,23 +1256,11 @@ static int alloc_l4_table(struct page_in
     pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
         l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
                       __PAGE_HYPERVISOR);
-    if ( is_pv_32on64_domain(d) )
-        pl4e[l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
-            l4e_from_page(virt_to_page(d->arch.mm_arg_xlat_l3),
-                          __PAGE_HYPERVISOR);
-
-    return 1;
-
- fail:
-    MEM_LOG("Failure in alloc_l4_table: entry %d", i);
-    while ( i-- > 0 )
-        if ( is_guest_l4_slot(d, i) )
-            put_page_from_l4e(pl4e[i], pfn);
 
-    return 0;
+    return rc > 0 ? 0 : rc;
 }
 #else
-#define alloc_l4_table(page) (0)
+#define alloc_l4_table(page, preemptible) (-EINVAL)
 #endif
 
 
@@ -1216,7 +1269,7 @@ static void free_l1_table(struct page_in
     struct domain *d = page_get_owner(page);
     unsigned long pfn = page_to_mfn(page);
     l1_pgentry_t *pl1e;
-    int i;
+    unsigned int  i;
 
     pl1e = map_domain_page(pfn);
 
@@ -1228,64 +1281,113 @@ static void free_l1_table(struct page_in
 }
 
 
-static void free_l2_table(struct page_info *page)
+static int free_l2_table(struct page_info *page, int preemptible)
 {
 #ifdef CONFIG_COMPAT
     struct domain *d = page_get_owner(page);
 #endif
     unsigned long pfn = page_to_mfn(page);
     l2_pgentry_t *pl2e;
-    int i;
+    unsigned int  i = page->nr_validated_ptes - 1;
+    int err = 0;
 
     pl2e = map_domain_page(pfn);
 
-    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
-        if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
-            put_page_from_l2e(pl2e[i], pfn);
+    ASSERT(page->nr_validated_ptes);
+    do {
+        if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
+             put_page_from_l2e(pl2e[i], pfn) == 0 &&
+             preemptible && i && hypercall_preempt_check() )
+        {
+           page->nr_validated_ptes = i;
+           err = -EAGAIN;
+        }
+    } while ( !err && i-- );
 
     unmap_domain_page(pl2e);
 
-    page->u.inuse.type_info &= ~PGT_pae_xen_l2;
+    if ( !err )
+        page->u.inuse.type_info &= ~PGT_pae_xen_l2;
+
+    return err;
 }
 
 
 #if CONFIG_PAGING_LEVELS >= 3
-
-static void free_l3_table(struct page_info *page)
+static int free_l3_table(struct page_info *page, int preemptible)
 {
     struct domain *d = page_get_owner(page);
     unsigned long pfn = page_to_mfn(page);
     l3_pgentry_t *pl3e;
-    int           i;
+    int rc = 0, partial = page->partial_pte;
+    unsigned int  i = page->nr_validated_ptes - !partial;
 
     pl3e = map_domain_page(pfn);
 
-    for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
+    do {
         if ( is_guest_l3_slot(i) )
         {
-            put_page_from_l3e(pl3e[i], pfn);
+            rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible);
+            if ( rc < 0 )
+                break;
+            partial = 0;
+            if ( rc > 0 )
+                continue;
             unadjust_guest_l3e(pl3e[i], d);
         }
+    } while ( i-- );
 
     unmap_domain_page(pl3e);
-}
 
+    if ( rc == -EAGAIN )
+    {
+        page->nr_validated_ptes = i;
+        page->partial_pte = partial ?: -1;
+    }
+    else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
+    {
+        page->nr_validated_ptes = i + 1;
+        page->partial_pte = 0;
+        rc = -EAGAIN;
+    }
+    return rc > 0 ? 0 : rc;
+}
+#else
+#define free_l3_table(page, preemptible) (-EINVAL)
 #endif
 
 #if CONFIG_PAGING_LEVELS >= 4
-
-static void free_l4_table(struct page_info *page)
+static int free_l4_table(struct page_info *page, int preemptible)
 {
     struct domain *d = page_get_owner(page);
     unsigned long pfn = page_to_mfn(page);
     l4_pgentry_t *pl4e = page_to_virt(page);
-    int           i;
+    int rc = 0, partial = page->partial_pte;
+    unsigned int  i = page->nr_validated_ptes - !partial;
 
-    for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
+    do {
         if ( is_guest_l4_slot(d, i) )
-            put_page_from_l4e(pl4e[i], pfn);
-}
+            rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible);
+        if ( rc < 0 )
+            break;
+        partial = 0;
+    } while ( i-- );
 
+    if ( rc == -EAGAIN )
+    {
+        page->nr_validated_ptes = i;
+        page->partial_pte = partial ?: -1;
+    }
+    else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
+    {
+        page->nr_validated_ptes = i + 1;
+        page->partial_pte = 0;
+        rc = -EAGAIN;
+    }
+    return rc > 0 ? 0 : rc;
+}
+#else
+#define free_l4_table(page, preemptible) (-EINVAL)
 #endif
 
 
@@ -1295,16 +1397,24 @@ static inline int update_intpte(intpte_t
                                 intpte_t old, 
                                 intpte_t new,
                                 unsigned long mfn,
-                                struct vcpu *v)
+                                struct vcpu *v,
+                                int preserve_ad)
 {
     int rv = 1;
 #ifndef PTE_UPDATE_WITH_CMPXCHG
-    rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
-#else
+    if ( !preserve_ad )
+    {
+        rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
+    }
+    else
+#endif
     {
         intpte_t t = old;
         for ( ; ; )
         {
+            if ( preserve_ad )
+                new |= old & (_PAGE_ACCESSED | _PAGE_DIRTY);
+
             rv = paging_cmpxchg_guest_entry(v, p, &t, new, _mfn(mfn));
             if ( unlikely(rv == 0) )
             {
@@ -1322,20 +1432,19 @@ static inline int update_intpte(intpte_t
             old = t;
         }
     }
-#endif
     return rv;
 }
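
With preserve_ad set, update_intpte() folds the old entry's accessed/dirty bits into the new value before the compare-and-exchange, so an update cannot lose A/D information the hardware may have set concurrently. A worked example of that merge, using the standard x86 PTE bit values:

/* Worked example of the preserve_ad merge in update_intpte(): the old
 * entry's A/D bits are folded into the new value before writing. */
#include <stdint.h>
#include <stdio.h>

#define _PAGE_ACCESSED 0x20U   /* x86 PTE bit 5 */
#define _PAGE_DIRTY    0x40U   /* x86 PTE bit 6 */

int main(void)
{
    uint64_t old = 0x80000000000e3067ULL;  /* hypothetical PTE, A and D set */
    uint64_t new = 0x80000000000e4005ULL;  /* replacement, A/D clear */

    new |= old & (_PAGE_ACCESSED | _PAGE_DIRTY);   /* same as the patch */

    printf("merged PTE: %#llx\n", (unsigned long long)new); /* ...e4065 */
    return 0;
}
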
 
 /* Macro that wraps the appropriate type-changes around update_intpte().
  * Arguments are: type, ptr, old, new, mfn, vcpu */
-#define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v)                             \
+#define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad)                         \
     update_intpte(&_t ## e_get_intpte(*(_p)),                       \
                   _t ## e_get_intpte(_o), _t ## e_get_intpte(_n),   \
-                  (_m), (_v))
+                  (_m), (_v), (_ad))
 
 /* Update the L1 entry at pl1e to new value nl1e. */
 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e, 
-                        unsigned long gl1mfn)
+                        unsigned long gl1mfn, int preserve_ad)
 {
     l1_pgentry_t ol1e;
     struct domain *d = current->domain;
@@ -1345,7 +1454,7 @@ static int mod_l1_entry(l1_pgentry_t *pl
         return 0;
 
     if ( unlikely(paging_mode_refcounts(d)) )
-        return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current);
+        return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current, preserve_ad);
 
     if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
     {
@@ -1363,16 +1472,20 @@ static int mod_l1_entry(l1_pgentry_t *pl
             return 0;
         }
 
-        adjust_guest_l1e(nl1e, d);
-
         /* Fast path for identical mapping, r/w and presence. */
         if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
-            return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current);
+        {
+            adjust_guest_l1e(nl1e, d);
+            return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current,
+                                preserve_ad);
+        }
 
         if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
             return 0;
-        
-        if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current)) )
+
+        adjust_guest_l1e(nl1e, d);
+        if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current,
+                                    preserve_ad)) )
         {
             put_page_from_l1e(nl1e, d);
             return 0;
@@ -1380,7 +1493,8 @@ static int mod_l1_entry(l1_pgentry_t *pl
     }
     else
     {
-        if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current)) )
+        if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current,
+                                    preserve_ad)) )
             return 0;
     }
 
@@ -1393,7 +1507,8 @@ static int mod_l1_entry(l1_pgentry_t *pl
 static int mod_l2_entry(l2_pgentry_t *pl2e, 
                         l2_pgentry_t nl2e, 
                         unsigned long pfn,
-                        unsigned long type)
+                        unsigned long type,
+                        int preserve_ad)
 {
     l2_pgentry_t ol2e;
     struct domain *d = current->domain;
@@ -1416,22 +1531,27 @@ static int mod_l2_entry(l2_pgentry_t *pl
             return 0;
         }
 
-        adjust_guest_l2e(nl2e, d);
-
         /* Fast path for identical mapping and presence. */
         if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
-            return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current);
+        {
+            adjust_guest_l2e(nl2e, d);
+            return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current, preserve_ad);
+        }
 
-        if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) )
+        if ( unlikely(get_page_from_l2e(nl2e, pfn, d) < 0) )
             return 0;
 
-        if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current)) )
+        adjust_guest_l2e(nl2e, d);
+
+        if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current,
+                                    preserve_ad)) )
         {
             put_page_from_l2e(nl2e, pfn);
             return 0;
         }
     }
-    else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current)) )
+    else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current,
+                                     preserve_ad)) )
     {
         return 0;
     }
@@ -1445,16 +1565,18 @@ static int mod_l2_entry(l2_pgentry_t *pl
 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
 static int mod_l3_entry(l3_pgentry_t *pl3e, 
                         l3_pgentry_t nl3e, 
-                        unsigned long pfn)
+                        unsigned long pfn,
+                        int preserve_ad,
+                        int preemptible)
 {
     l3_pgentry_t ol3e;
     struct domain *d = current->domain;
-    int okay;
+    int rc = 0;
 
     if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
     {
         MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
-        return 0;
+        return -EINVAL;
     }
 
 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
@@ -1463,11 +1585,11 @@ static int mod_l3_entry(l3_pgentry_t *pl
      * would be a pain to ensure they remain continuously valid throughout.
      */
     if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
-        return 0;
+        return -EINVAL;
 #endif 
 
     if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
-        return 0;
+        return -EFAULT;
 
     if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
     {
@@ -1475,36 +1597,46 @@ static int mod_l3_entry(l3_pgentry_t *pl
         {
             MEM_LOG("Bad L3 flags %x",
                     l3e_get_flags(nl3e) & l3_disallow_mask(d));
-            return 0;
+            return -EINVAL;
         }
 
-        adjust_guest_l3e(nl3e, d);
-
         /* Fast path for identical mapping and presence. */
         if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
-            return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current);
+        {
+            adjust_guest_l3e(nl3e, d);
+            rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current, preserve_ad);
+            return rc ? 0 : -EFAULT;
+        }
 
-        if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) )
-            return 0;
+        rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible);
+        if ( unlikely(rc < 0) )
+            return rc;
+        rc = 0;
 
-        if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current)) )
+        adjust_guest_l3e(nl3e, d);
+        if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current,
+                                    preserve_ad)) )
         {
-            put_page_from_l3e(nl3e, pfn);
-            return 0;
+            ol3e = nl3e;
+            rc = -EFAULT;
         }
     }
-    else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current)) )
+    else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current,
+                                     preserve_ad)) )
     {
-        return 0;
+        return -EFAULT;
     }
 
-    okay = create_pae_xen_mappings(d, pl3e);
-    BUG_ON(!okay);
+    if ( likely(rc == 0) )
+    {
+        if ( !create_pae_xen_mappings(d, pl3e) )
+            BUG();
 
-    pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
+        pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
+    }
 
-    put_page_from_l3e(ol3e, pfn);
-    return 1;
+    put_page_from_l3e(ol3e, pfn, 0, 0);
+    return rc;
 }
 
 #endif
@@ -1515,18 +1647,21 @@ static int mod_l3_entry(l3_pgentry_t *pl
 static int mod_l4_entry(struct domain *d,
                         l4_pgentry_t *pl4e, 
                         l4_pgentry_t nl4e, 
-                        unsigned long pfn)
+                        unsigned long pfn,
+                        int preserve_ad,
+                        int preemptible)
 {
     l4_pgentry_t ol4e;
+    int rc = 0;
 
     if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
     {
         MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
-        return 0;
+        return -EINVAL;
     }
 
     if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
-        return 0;
+        return -EFAULT;
 
     if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
     {
@@ -1534,38 +1669,69 @@ static int mod_l4_entry(struct domain *d
         {
             MEM_LOG("Bad L4 flags %x",
                     l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
-            return 0;
+            return -EINVAL;
         }
 
-        adjust_guest_l4e(nl4e, current->domain);
-
         /* Fast path for identical mapping and presence. */
         if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
-            return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current);
+        {
+            adjust_guest_l4e(nl4e, current->domain);
+            rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current, preserve_ad);
+            return rc ? 0 : -EFAULT;
+        }
 
-        if ( unlikely(!get_page_from_l4e(nl4e, pfn, current->domain)) )
-            return 0;
+        rc = get_page_from_l4e(nl4e, pfn, current->domain, 0, preemptible);
+        if ( unlikely(rc < 0) )
+            return rc;
+        rc = 0;
 
-        if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current)) )
+        adjust_guest_l4e(nl4e, current->domain);
+        if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current,
+                                    preserve_ad)) )
         {
-            put_page_from_l4e(nl4e, pfn);
-            return 0;
+            ol4e = nl4e;
+            rc = -EFAULT;
         }
     }
-    else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current)) )
+    else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current,
+                                     preserve_ad)) )
     {
-        return 0;
+        return -EFAULT;
     }
 
-    put_page_from_l4e(ol4e, pfn);
-    return 1;
+    put_page_from_l4e(ol4e, pfn, 0, 0);
+    return rc;
 }
 
 #endif
 
-int alloc_page_type(struct page_info *page, unsigned long type)
+/*
+ * Special version of get_page() to be used exclusively when
+ * - a page is known to already have a non-zero reference count
+ * - the page does not need its owner to be checked
+ * - it will not be called more than once without dropping the thus
+ *   acquired reference again.
+ * Due to get_page() reserving one reference, this call cannot fail.
+ */
+static void get_page_light(struct page_info *page)
+{
+    u32 x, nx, y = page->count_info;
+
+    do {
+        x  = y;
+        nx = x + 1;
+        BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
+        BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
+        y = cmpxchg(&page->count_info, x, nx);
+    }
+    while ( unlikely(y != x) );
+}
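
get_page_light() is a lock-free reference bump built on a cmpxchg retry loop, relying on the caller already holding a reference so the count can never be observed at zero. A portable analogue of the same pattern using C11 atomics (illustration only, not the Xen primitive; the count mask value is assumed):

/* Portable analogue of the cmpxchg retry loop in get_page_light(), using
 * C11 atomics. Illustration of the pattern only, not the Xen primitive. */
#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define PGC_COUNT_MASK 0x7fffffffU   /* assumed layout for the example */

static void refcount_bump(_Atomic uint32_t *count_info)
{
    uint32_t x = atomic_load(count_info), nx;

    do {
        nx = x + 1;
        assert(x & PGC_COUNT_MASK);    /* caller guarantees count != 0 */
        assert(nx & PGC_COUNT_MASK);   /* and that it cannot overflow  */
    } while ( !atomic_compare_exchange_weak(count_info, &x, nx) );
}

int main(void)
{
    _Atomic uint32_t ci = 1;
    refcount_bump(&ci);
    printf("count_info = %u\n", (unsigned)atomic_load(&ci));  /* 2 */
    return 0;
}
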
+
+int alloc_page_type(struct page_info *page, unsigned long type,
+                    int preemptible)
 {
     struct domain *owner = page_get_owner(page);
+    int rc;
 
     /* A page table is dirtied when its type count becomes non-zero. */
     if ( likely(owner != NULL) )
@@ -1574,31 +1740,66 @@ int alloc_page_type(struct page_info *pa
     switch ( type & PGT_type_mask )
     {
     case PGT_l1_page_table:
-        return alloc_l1_table(page);
+        rc = alloc_l1_table(page);
+        break;
     case PGT_l2_page_table:
-        return alloc_l2_table(page, type);
+        rc = alloc_l2_table(page, type, preemptible);
+        break;
     case PGT_l3_page_table:
-        return alloc_l3_table(page);
+        rc = alloc_l3_table(page, preemptible);
+        break;
     case PGT_l4_page_table:
-        return alloc_l4_table(page);
+        rc = alloc_l4_table(page, preemptible);
+        break;
     case PGT_gdt_page:
     case PGT_ldt_page:
-        return alloc_segdesc_page(page);
+        rc = alloc_segdesc_page(page);
+        break;
     default:
         printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n", 
                type, page->u.inuse.type_info,
                page->count_info);
+        rc = -EINVAL;
         BUG();
     }
 
-    return 0;
+    /* No need for atomic update of type_info here: no one else updates it. */
+    wmb();
+    if ( rc == -EAGAIN )
+    {
+        get_page_light(page);
+        page->u.inuse.type_info |= PGT_partial;
+    }
+    else if ( rc == -EINTR )
+    {
+        ASSERT((page->u.inuse.type_info &
+                (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
+        page->u.inuse.type_info &= ~PGT_count_mask;
+    }
+    else if ( rc )
+    {
+        ASSERT(rc < 0);
+        MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
+                PRtype_info ": caf=%08x taf=%" PRtype_info,
+                page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
+                type, page->count_info, page->u.inuse.type_info);
+        page->u.inuse.type_info = 0;
+    }
+    else
+    {
+        page->u.inuse.type_info |= PGT_validated;
+    }
+
+    return rc;
 }
 
 
-void free_page_type(struct page_info *page, unsigned long type)
+int free_page_type(struct page_info *page, unsigned long type,
+                    int preemptible)
 {
     struct domain *owner = page_get_owner(page);
     unsigned long gmfn;
+    int rc;
 
     if ( likely(owner != NULL) )
     {
@@ -1618,7 +1819,7 @@ void free_page_type(struct page_info *pa
             paging_mark_dirty(owner, page_to_mfn(page));
 
             if ( shadow_mode_refcounts(owner) )
-                return;
+                return 0;
 
             gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
             ASSERT(VALID_M2P(gmfn));
@@ -1626,42 +1827,97 @@ void free_page_type(struct page_info *pa
         }
     }
 
+    if ( !(type & PGT_partial) )
+    {
+        page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
+        page->partial_pte = 0;
+    }
+
     switch ( type & PGT_type_mask )
     {
     case PGT_l1_page_table:
         free_l1_table(page);
+        rc = 0;
         break;
-
     case PGT_l2_page_table:
-        free_l2_table(page);
+        rc = free_l2_table(page, preemptible);
         break;
-
 #if CONFIG_PAGING_LEVELS >= 3
     case PGT_l3_page_table:
-        free_l3_table(page);
+#if CONFIG_PAGING_LEVELS == 3
+        if ( !(type & PGT_partial) )
+            page->nr_validated_ptes = L3_PAGETABLE_ENTRIES;
+#endif
+        rc = free_l3_table(page, preemptible);
         break;
 #endif
-
 #if CONFIG_PAGING_LEVELS >= 4
     case PGT_l4_page_table:
-        free_l4_table(page);
+        rc = free_l4_table(page, preemptible);
         break;
 #endif
 
     default:
-        printk("%s: type %lx pfn %lx\n",__FUNCTION__,
-               type, page_to_mfn(page));
+        MEM_LOG("type %lx pfn %lx", type, page_to_mfn(page));
+        rc = -EINVAL;
         BUG();
     }
+
+    return rc;
 }
 
 
-void put_page_type(struct page_info *page)
+static int __put_final_page_type(
+    struct page_info *page, unsigned long type, int preemptible)
+{
+    int rc = free_page_type(page, type, preemptible);
+
+    /* No need for atomic update of type_info here: no one else updates it. */
+    if ( rc == 0 )
+    {
+        /*
+         * Record TLB information for flush later. We do not stamp page tables
+         * when running in shadow mode:
+         *  1. Pointless, since it's the shadow pt's which must be tracked.
+         *  2. Shadow mode reuses this field for shadowed page tables to
+         *     store flags info -- we don't want to conflict with that.
+         */
+        if ( !(shadow_mode_enabled(page_get_owner(page)) &&
+               (page->count_info & PGC_page_table)) )
+            page->tlbflush_timestamp = tlbflush_current_time();
+        wmb();
+        page->u.inuse.type_info--;
+    }
+    else if ( rc == -EINTR )
+    {
+        ASSERT((page->u.inuse.type_info &
+                (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
+        if ( !(shadow_mode_enabled(page_get_owner(page)) &&
+               (page->count_info & PGC_page_table)) )
+            page->tlbflush_timestamp = tlbflush_current_time();
+        wmb();
+        page->u.inuse.type_info |= PGT_validated;
+    }
+    else
+    {
+        BUG_ON(rc != -EAGAIN);
+        wmb();
+        get_page_light(page);
+        page->u.inuse.type_info |= PGT_partial;
+    }
+
+    return rc;
+}
+
+
+static int __put_page_type(struct page_info *page,
+                           int preemptible)
 {
     unsigned long nx, x, y = page->u.inuse.type_info;
+    int rc = 0;
 
- again:
-    do {
+    for ( ; ; )
+    {
         x  = y;
         nx = x - 1;
 
@@ -1670,21 +1926,22 @@ void put_page_type(struct page_info *pag
         if ( unlikely((nx & PGT_count_mask) == 0) )
         {
             if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
-                 likely(nx & PGT_validated) )
+                 likely(nx & (PGT_validated|PGT_partial)) )
             {
                 /*
                  * Page-table pages must be unvalidated when count is zero. The
                  * 'free' is safe because the refcnt is non-zero and validated
                  * bit is clear => other ops will spin or fail.
                  */
-                if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 
-                                           x & ~PGT_validated)) != x) )
-                    goto again;
+                nx = x & ~(PGT_validated|PGT_partial);
+                if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
+                                           x, nx)) != x) )
+                    continue;
                 /* We cleared the 'valid bit' so we do the clean up. */
-                free_page_type(page, x);
-                /* Carry on, but with the 'valid bit' now clear. */
-                x  &= ~PGT_validated;
-                nx &= ~PGT_validated;
+                rc = __put_final_page_type(page, x, preemptible);
+                if ( x & PGT_partial )
+                    put_page(page);
+                break;
             }
 
             /*
@@ -1698,25 +1955,34 @@ void put_page_type(struct page_info *pag
                    (page->count_info & PGC_page_table)) )
                 page->tlbflush_timestamp = tlbflush_current_time();
         }
+
+        if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
+            break;
+
+        if ( preemptible && hypercall_preempt_check() )
+            return -EINTR;
     }
-    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
+
+    return rc;
 }
 
 
-int get_page_type(struct page_info *page, unsigned long type)
+static int __get_page_type(struct page_info *page, unsigned long type,
+                           int preemptible)
 {
     unsigned long nx, x, y = page->u.inuse.type_info;
+    int rc = 0;
 
     ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
 
- again:
-    do {
+    for ( ; ; )
+    {
         x  = y;
         nx = x + 1;
         if ( unlikely((nx & PGT_count_mask) == 0) )
         {
             MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
-            return 0;
+            return -EINVAL;
         }
         else if ( unlikely((x & PGT_count_mask) == 0) )
         {
@@ -1763,50 +2029,85 @@ int get_page_type(struct page_info *page
             /* Don't log failure if it could be a recursive-mapping attempt. */
             if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
                  (type == PGT_l1_page_table) )
-                return 0;
+                return -EINVAL;
             if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
                  (type == PGT_l2_page_table) )
-                return 0;
+                return -EINVAL;
             if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
                  (type == PGT_l3_page_table) )
-                return 0;
+                return -EINVAL;
             MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
                     "for mfn %lx (pfn %lx)",
                     x, type, page_to_mfn(page),
                     get_gpfn_from_mfn(page_to_mfn(page)));
-            return 0;
+            return -EINVAL;
         }
         else if ( unlikely(!(x & PGT_validated)) )
         {
-            /* Someone else is updating validation of this page. Wait... */
-            while ( (y = page->u.inuse.type_info) == x )
-                cpu_relax();
-            goto again;
+            if ( !(x & PGT_partial) )
+            {
+                /* Someone else is updating validation of this page. Wait... */
+                while ( (y = page->u.inuse.type_info) == x )
+                {
+                    if ( preemptible && hypercall_preempt_check() )
+                        return -EINTR;
+                    cpu_relax();
+                }
+                continue;
+            }
+            /* Type ref count was left at 1 when PGT_partial got set. */
+            ASSERT((x & PGT_count_mask) == 1);
+            nx = x & ~PGT_partial;
         }
+
+        if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
+            break;
+
+        if ( preemptible && hypercall_preempt_check() )
+            return -EINTR;
     }
-    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
 
     if ( unlikely(!(nx & PGT_validated)) )
     {
-        /* Try to validate page type; drop the new reference on failure. */
-        if ( unlikely(!alloc_page_type(page, type)) )
+        if ( !(x & PGT_partial) )
         {
-            MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
-                    PRtype_info ": caf=%08x taf=%" PRtype_info,
-                    page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
-                    type, page->count_info, page->u.inuse.type_info);
-            /* Noone else can get a reference. We hold the only ref. */
-            page->u.inuse.type_info = 0;
-            return 0;
+            page->nr_validated_ptes = 0;
+            page->partial_pte = 0;
         }
-
-        /* Noone else is updating simultaneously. */
-        __set_bit(_PGT_validated, &page->u.inuse.type_info);
+        rc = alloc_page_type(page, type, preemptible);
     }
 
-    return 1;
+    if ( (x & PGT_partial) && !(nx & PGT_partial) )
+        put_page(page);
+
+    return rc;
+}
+
+void put_page_type(struct page_info *page)
+{
+    int rc = __put_page_type(page, 0);
+    ASSERT(rc == 0);
+    (void)rc;
 }
 
+int get_page_type(struct page_info *page, unsigned long type)
+{
+    int rc = __get_page_type(page, type, 0);
+    if ( likely(rc == 0) )
+        return 1;
+    ASSERT(rc == -EINVAL);
+    return 0;
+}
+
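+/*
+ * Preemptible variants: -EINTR means the operation did not complete and has
+ * been fully unwound, so the caller can simply retry; -EAGAIN means the page
+ * was left with PGT_partial set and the (de)validation will be continued on
+ * the next attempt.
+ */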
+int put_page_type_preemptible(struct page_info *page)
+{
+    return __put_page_type(page, 1);
+}
+
+int get_page_type_preemptible(struct page_info *page, unsigned long type)
+{
+    return __get_page_type(page, type, 1);
+}
 
 int new_guest_cr3(unsigned long mfn)
 {
@@ -1826,7 +2127,7 @@ int new_guest_cr3(unsigned long mfn)
                     l4e_from_pfn(
                         mfn,
                         (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
-                    pagetable_get_pfn(v->arch.guest_table));
+                    pagetable_get_pfn(v->arch.guest_table), 0, 0) == 0;
         if ( unlikely(!okay) )
         {
             MEM_LOG("Error while installing new compat baseptr %lx", mfn);
@@ -1841,7 +2142,7 @@ int new_guest_cr3(unsigned long mfn)
 #endif
     okay = paging_mode_refcounts(d)
         ? get_page_from_pagenr(mfn, d)
-        : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
+        : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
     if ( unlikely(!okay) )
     {
         MEM_LOG("Error while installing new baseptr %lx", mfn);
@@ -1962,6 +2263,12 @@ static inline cpumask_t vcpumask_to_pcpu
     cpumask_t    pmask = CPU_MASK_NONE;
     struct vcpu *v;
 
+    /*
+     * Callers copy only a single guest-sized longword from the guest.
+     * This must be wide enough to reference all VCPUs. Worst case is 32 bits.
+     */
+    BUILD_BUG_ON(MAX_VIRT_CPUS > 32);
+
     while ( vmask != 0 )
     {
         vcpu_id = find_first_set_bit(vmask);
@@ -2015,9 +2322,7 @@ int do_mmuext_op(
     {
         if ( hypercall_preempt_check() )
         {
-            rc = hypercall_create_continuation(
-                __HYPERVISOR_mmuext_op, "hihi",
-                uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+            rc = -EAGAIN;
             break;
         }
 
@@ -2060,10 +2365,14 @@ int do_mmuext_op(
             if ( paging_mode_refcounts(FOREIGNDOM) )
                 break;
 
-            okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
+            rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 0, 1);
+            okay = !rc;
             if ( unlikely(!okay) )
             {
-                MEM_LOG("Error while pinning mfn %lx", mfn);
+                if ( rc == -EINTR )
+                    rc = -EAGAIN;
+                else if ( rc != -EAGAIN )
+                    MEM_LOG("Error while pinning mfn %lx", mfn);
                 break;
             }
 
@@ -2108,8 +2417,11 @@ int do_mmuext_op(
             {
                 put_page_and_type(page);
                 put_page(page);
-                /* A page is dirtied when its pin status is cleared. */
-                paging_mark_dirty(d, mfn);
+                if ( !rc )
+                {
+                    /* A page is dirtied when its pin status is cleared. */
+                    paging_mark_dirty(d, mfn);
+                }
             }
             else
             {
@@ -2133,8 +2445,8 @@ int do_mmuext_op(
                 if ( paging_mode_refcounts(d) )
                     okay = get_page_from_pagenr(mfn, d);
                 else
-                    okay = get_page_and_type_from_pagenr(
-                        mfn, PGT_root_page_table, d);
+                    okay = !get_page_and_type_from_pagenr(
+                        mfn, PGT_root_page_table, d, 0, 0);
                 if ( unlikely(!okay) )
                 {
                     MEM_LOG("Error while installing new mfn %lx", mfn);
@@ -2253,6 +2565,11 @@ int do_mmuext_op(
         guest_handle_add_offset(uops, 1);
     }
 
+    if ( rc == -EAGAIN )
+        rc = hypercall_create_continuation(
+            __HYPERVISOR_mmuext_op, "hihi",
+            uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+
     process_deferred_ops();
 
     UNLOCK_BIGLOCK(d);
@@ -2316,9 +2633,7 @@ int do_mmu_update(
     {
         if ( hypercall_preempt_check() )
         {
-            rc = hypercall_create_continuation(
-                __HYPERVISOR_mmu_update, "hihi",
-                ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+            rc = -EAGAIN;
             break;
         }
 
@@ -2336,9 +2651,12 @@ int do_mmu_update(
         {
             /*
              * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
+             * MMU_PT_UPDATE_PRESERVE_AD: As above but also preserve (OR)
+             * current A/D bits.
              */
         case MMU_NORMAL_PT_UPDATE:
-
+        case MMU_PT_UPDATE_PRESERVE_AD:
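+            /* The command is encoded in the low bits of req.ptr; strip it to
+             * recover the actual page-table entry address. */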
+            req.ptr -= cmd;
             gmfn = req.ptr >> PAGE_SHIFT;
             mfn = gmfn_to_mfn(d, gmfn);
 
@@ -2375,20 +2693,24 @@ int do_mmu_update(
                 case PGT_l1_page_table:
                 {
                     l1_pgentry_t l1e = l1e_from_intpte(req.val);
-                    okay = mod_l1_entry(va, l1e, mfn);
+                    okay = mod_l1_entry(va, l1e, mfn,
+                                        cmd == MMU_PT_UPDATE_PRESERVE_AD);
                 }
                 break;
                 case PGT_l2_page_table:
                 {
                     l2_pgentry_t l2e = l2e_from_intpte(req.val);
-                    okay = mod_l2_entry(va, l2e, mfn, type_info);
+                    okay = mod_l2_entry(va, l2e, mfn, type_info,
+                                        cmd == MMU_PT_UPDATE_PRESERVE_AD);
                 }
                 break;
 #if CONFIG_PAGING_LEVELS >= 3
                 case PGT_l3_page_table:
                 {
                     l3_pgentry_t l3e = l3e_from_intpte(req.val);
-                    okay = mod_l3_entry(va, l3e, mfn);
+                    rc = mod_l3_entry(va, l3e, mfn,
+                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, 1);
+                    okay = !rc;
                 }
                 break;
 #endif
@@ -2396,13 +2718,17 @@ int do_mmu_update(
                 case PGT_l4_page_table:
                 {
                     l4_pgentry_t l4e = l4e_from_intpte(req.val);
-                    okay = mod_l4_entry(d, va, l4e, mfn);
+                    rc = mod_l4_entry(d, va, l4e, mfn,
+                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, 1);
+                    okay = !rc;
                 }
                 break;
 #endif
                 }
 
                 put_page_type(page);
+                if ( rc == -EINTR )
+                    rc = -EAGAIN;
             }
             break;
 
@@ -2465,6 +2791,11 @@ int do_mmu_update(
         guest_handle_add_offset(ureqs, 1);
     }
 
+    if ( rc == -EAGAIN )
+        rc = hypercall_create_continuation(
+            __HYPERVISOR_mmu_update, "hihi",
+            ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+
     process_deferred_ops();
 
     UNLOCK_BIGLOCK(d);
@@ -2522,7 +2853,7 @@ static int create_grant_pte_mapping(
     }
 
     ol1e = *(l1_pgentry_t *)va;
-    if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v) )
+    if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v, 0) )
     {
         put_page_type(page);
         rc = GNTST_general_error;
@@ -2590,9 +2921,11 @@ static int destroy_grant_pte_mapping(
     }
 
     /* Delete pagetable entry. */
-    if ( unlikely(!UPDATE_ENTRY(l1, 
-                      (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn, 
-                      d->vcpu[0] /* Change if we go to per-vcpu shadows. */)) )
+    if ( unlikely(!UPDATE_ENTRY
+                  (l1, 
+                   (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn, 
+                   d->vcpu[0] /* Change if we go to per-vcpu shadows. */,
+                   0)) )
     {
         MEM_LOG("Cannot delete PTE entry at %p", va);
         put_page_type(page);
@@ -2628,7 +2961,7 @@ static int create_grant_va_mapping(
         return GNTST_general_error;
     }
     ol1e = *pl1e;
-    okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v);
+    okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0);
     guest_unmap_l1e(v, pl1e);
     pl1e = NULL;
 
@@ -2666,7 +2999,7 @@ static int destroy_grant_va_mapping(
     }
 
     /* Delete pagetable entry. */
-    if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(), gl1mfn, v)) )
+    if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(), gl1mfn, v, 0)) )
     {
         MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
         rc = GNTST_general_error;
@@ -2768,7 +3101,7 @@ int do_update_va_mapping(unsigned long v
 
     pl1e = guest_map_l1e(v, va, &gl1mfn);
 
-    if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn)) )
+    if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn, 0)) )
         rc = -EINVAL;
 
     if ( pl1e )
@@ -3061,7 +3394,7 @@ long arch_memory_op(int op, XEN_GUEST_HA
         {
             if ( is_xen_heap_frame(mfn_to_page(prev_mfn)) )
                 /* Xen heap frames are simply unhooked from this phys slot. */
-                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
+                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
             else
                 /* Normal domain memory is freed, to avoid leaking memory. */
                 guest_remove_page(d, xatp.gpfn);
@@ -3070,10 +3403,10 @@ long arch_memory_op(int op, XEN_GUEST_HA
         /* Unmap from old location, if any. */
         gpfn = get_gpfn_from_mfn(mfn);
         if ( gpfn != INVALID_M2P_ENTRY )
-            guest_physmap_remove_page(d, gpfn, mfn);
+            guest_physmap_remove_page(d, gpfn, mfn, 0);
 
         /* Map at new location. */
-        guest_physmap_add_page(d, xatp.gpfn, mfn);
+        guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
 
         UNLOCK_BIGLOCK(d);
 
@@ -3318,7 +3651,7 @@ static int ptwr_emulated_update(
     else
     {
         ol1e = *pl1e;
-        if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v) )
+        if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v, 0) )
             BUG();
     }
 
diff -Naurp xen/arch/x86/msi.c xen-redhat/arch/x86/msi.c
--- xen/arch/x86/msi.c
+++ xen-redhat/arch/x86/msi.c
@@ -0,0 +1,858 @@
+/*
+ * File:    msi.c
+ * Purpose: PCI Message Signaled Interrupt (MSI)
+ *
+ * Copyright (C) 2003-2004 Intel
+ * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com)
+ */
+
+#include <xen/config.h>
+#include <xen/lib.h>
+#include <xen/init.h>
+#include <xen/irq.h>
+#include <xen/delay.h>
+#include <xen/sched.h>
+#include <xen/acpi.h>
+#include <xen/errno.h>
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
+#include <xen/keyhandler.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/desc.h>
+#include <asm/msi.h>
+#include <asm/fixmap.h>
+#include <mach_apic.h>
+#include <io_ports.h>
+#include <public/physdev.h>
+#include <xen/iommu.h>
+
+/* Bitmap tracking which of the reserved MSI-X fixmap pages are in use. */
+DEFINE_SPINLOCK(msix_fixmap_lock);
+DECLARE_BITMAP(msix_fixmap_pages, FIX_MSIX_MAX_PAGES);
+
+static int msix_fixmap_alloc(void)
+{
+    int i, rc = -ENOMEM;
+
+    spin_lock(&msix_fixmap_lock);
+    for ( i = 0; i < FIX_MSIX_MAX_PAGES; i++ )
+        if ( !test_bit(i, &msix_fixmap_pages) )
+            break;
+    if ( i == FIX_MSIX_MAX_PAGES )
+        goto out;
+    rc = FIX_MSIX_IO_RESERV_BASE + i;
+    set_bit(i, &msix_fixmap_pages);
+
+ out:
+    spin_unlock(&msix_fixmap_lock);
+    return rc;
+}
+
+static void msix_fixmap_free(int idx)
+{
+    spin_lock(&msix_fixmap_lock);
+    if ( idx >= FIX_MSIX_IO_RESERV_BASE )
+        clear_bit(idx - FIX_MSIX_IO_RESERV_BASE, &msix_fixmap_pages);
+    spin_unlock(&msix_fixmap_lock);
+}
+
+static int msix_get_fixmap(struct pci_dev *dev, unsigned long table_paddr,
+                           unsigned long entry_paddr)
+{
+    int nr_page, idx;
+
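+    /* Which page of the MSI-X table holds this entry, relative to the
+     * start of the table. */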
+    nr_page = (entry_paddr >> PAGE_SHIFT) - (table_paddr >> PAGE_SHIFT);
+
+    if ( nr_page < 0 || nr_page >= MAX_MSIX_TABLE_PAGES )
+        return -EINVAL;
+
+    spin_lock(&dev->msix_table_lock);
+    if ( dev->msix_table_refcnt[nr_page]++ == 0 )
+    {
+        idx = msix_fixmap_alloc();
+        if ( idx < 0 )
+        {
+            dev->msix_table_refcnt[nr_page]--;
+            goto out;
+        }
+        set_fixmap_nocache(idx, entry_paddr);
+        dev->msix_table_idx[nr_page] = idx;
+    }
+    else
+        idx = dev->msix_table_idx[nr_page];
+
+ out:
+    spin_unlock(&dev->msix_table_lock);
+    return idx;
+}
+
+static void msix_put_fixmap(struct pci_dev *dev, int idx)
+{
+    int i;
+    unsigned long start;
+
+    spin_lock(&dev->msix_table_lock);
+    for ( i = 0; i < MAX_MSIX_TABLE_PAGES; i++ )
+    {
+        if ( dev->msix_table_idx[i] == idx )
+            break;
+    }
+    if ( i == MAX_MSIX_TABLE_PAGES )
+        goto out;
+
+    if ( --dev->msix_table_refcnt[i] == 0 )
+    {
+        start = fix_to_virt(idx);
+        destroy_xen_mappings(start, start + PAGE_SIZE);
+        msix_fixmap_free(idx);
+        dev->msix_table_idx[i] = 0;
+    }
+
+ out:
+    spin_unlock(&dev->msix_table_lock);
+}
+
+/*
+ * MSI message composition
+ */
+static void msi_compose_msg(struct pci_dev *pdev, int vector,
+                            struct msi_msg *msg)
+{
+    unsigned dest;
+    cpumask_t tmp;
+
+    tmp = TARGET_CPUS;
+    if ( vector )
+    {
+        dest = cpu_mask_to_apicid(tmp);
+
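+        /*
+         * Standard x86 MSI encoding: the address word selects the target
+         * APIC (destination/redirection mode), the data word carries the
+         * trigger mode, delivery mode and vector.
+         */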
+        msg->address_hi = MSI_ADDR_BASE_HI;
+        msg->address_lo =
+            MSI_ADDR_BASE_LO |
+            ((INT_DEST_MODE == 0) ?
+             MSI_ADDR_DESTMODE_PHYS:
+             MSI_ADDR_DESTMODE_LOGIC) |
+            ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+             MSI_ADDR_REDIRECTION_CPU:
+             MSI_ADDR_REDIRECTION_LOWPRI) |
+            MSI_ADDR_DEST_ID(dest);
+
+        msg->data =
+            MSI_DATA_TRIGGER_EDGE |
+            MSI_DATA_LEVEL_ASSERT |
+            ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+             MSI_DATA_DELIVERY_FIXED:
+             MSI_DATA_DELIVERY_LOWPRI) |
+            MSI_DATA_VECTOR(vector);
+    }
+}
+
+static void read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
+{
+    switch ( entry->msi_attrib.type )
+    {
+    case PCI_CAP_ID_MSI:
+    {
+        struct pci_dev *dev = entry->dev;
+        int pos = entry->msi_attrib.pos;
+        u16 data;
+        u8 bus = dev->bus;
+        u8 slot = PCI_SLOT(dev->devfn);
+        u8 func = PCI_FUNC(dev->devfn);
+
+        msg->address_lo = pci_conf_read32(bus, slot, func,
+                                          msi_lower_address_reg(pos));
+        if ( entry->msi_attrib.is_64 )
+        {
+            msg->address_hi = pci_conf_read32(bus, slot, func,
+                                              msi_upper_address_reg(pos));
+            data = pci_conf_read16(bus, slot, func, msi_data_reg(pos, 1));
+        }
+        else
+        {
+            msg->address_hi = 0;
+            data = pci_conf_read16(bus, slot, func, msi_data_reg(pos, 0));
+        }
+        msg->data = data;
+        break;
+    }
+    case PCI_CAP_ID_MSIX:
+    {
+        void __iomem *base;
+        base = entry->mask_base;
+
+        msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
+        msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
+        msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET);
+        break;
+    }
+    default:
+        BUG();
+    }
+
+    if ( vtd_enabled )
+        msi_msg_read_remap_rte(entry, msg);
+}
+
+static int set_vector_msi(struct msi_desc *entry)
+{
+    if ( entry->vector >= NR_VECTORS )
+    {
+        dprintk(XENLOG_ERR, "Trying to install msi data for Vector %d\n",
+                entry->vector);
+        return -EINVAL;
+    }
+
+    irq_desc[entry->vector].msi_desc = entry;
+    return 0;
+}
+
+static int unset_vector_msi(int vector)
+{
+    ASSERT(spin_is_locked(&irq_desc[vector].lock));
+
+    if ( vector >= NR_VECTORS )
+    {
+        dprintk(XENLOG_ERR, "Trying to uninstall msi data for Vector %d\n",
+                vector);
+        return -EINVAL;
+    }
+
+    irq_desc[vector].msi_desc = NULL;
+
+    return 0;
+}
+
+static void write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
+{
+    if ( iommu_enabled )
+        iommu_update_ire_from_msi(entry, msg);
+
+    switch ( entry->msi_attrib.type )
+    {
+    case PCI_CAP_ID_MSI:
+    {
+        struct pci_dev *dev = entry->dev;
+        int pos = entry->msi_attrib.pos;
+        u8 bus = dev->bus;
+        u8 slot = PCI_SLOT(dev->devfn);
+        u8 func = PCI_FUNC(dev->devfn);
+
+        pci_conf_write32(bus, slot, func, msi_lower_address_reg(pos),
+                         msg->address_lo);
+        if ( entry->msi_attrib.is_64 )
+        {
+            pci_conf_write32(bus, slot, func, msi_upper_address_reg(pos),
+                             msg->address_hi);
+            pci_conf_write16(bus, slot, func, msi_data_reg(pos, 1),
+                             msg->data);
+        }
+        else
+            pci_conf_write16(bus, slot, func, msi_data_reg(pos, 0),
+                             msg->data);
+        break;
+    }
+    case PCI_CAP_ID_MSIX:
+    {
+        void __iomem *base;
+        base = entry->mask_base;
+
+        writel(msg->address_lo,
+               base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
+        writel(msg->address_hi,
+               base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
+        writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET);
+        break;
+    }
+    default:
+        BUG();
+    }
+    entry->msg = *msg;
+}
+
+void set_msi_affinity(unsigned int vector, cpumask_t mask)
+{
+    struct msi_desc *desc = irq_desc[vector].msi_desc;
+    struct msi_msg msg;
+    unsigned int dest;
+
+    memset(&msg, 0, sizeof(msg));
+    cpus_and(mask, mask, cpu_online_map);
+    if ( cpus_empty(mask) )
+        mask = TARGET_CPUS;
+    dest = cpu_mask_to_apicid(mask);
+
+    if ( !desc )
+        return;
+
+    ASSERT(spin_is_locked(&irq_desc[vector].lock));
+    read_msi_msg(desc, &msg);
+
+    msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+    msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+    write_msi_msg(desc, &msg);
+}
+
+static void msi_set_enable(struct pci_dev *dev, int enable)
+{
+    int pos;
+    u16 control;
+    u8 bus = dev->bus;
+    u8 slot = PCI_SLOT(dev->devfn);
+    u8 func = PCI_FUNC(dev->devfn);
+
+    pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSI);
+    if ( pos )
+    {
+        control = pci_conf_read16(bus, slot, func, pos + PCI_MSI_FLAGS);
+        control &= ~PCI_MSI_FLAGS_ENABLE;
+        if ( enable )
+            control |= PCI_MSI_FLAGS_ENABLE;
+        pci_conf_write16(bus, slot, func, pos + PCI_MSI_FLAGS, control);
+    }
+}
+
+static void msix_set_enable(struct pci_dev *dev, int enable)
+{
+    int pos;
+    u16 control;
+    u8 bus = dev->bus;
+    u8 slot = PCI_SLOT(dev->devfn);
+    u8 func = PCI_FUNC(dev->devfn);
+
+    pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSIX);
+    if ( pos )
+    {
+        control = pci_conf_read16(bus, slot, func, pos + PCI_MSIX_FLAGS);
+        control &= ~PCI_MSIX_FLAGS_ENABLE;
+        if ( enable )
+            control |= PCI_MSIX_FLAGS_ENABLE;
+        pci_conf_write16(bus, slot, func, pos + PCI_MSIX_FLAGS, control);
+    }
+}
+
+static void msix_flush_writes(unsigned int vector)
+{
+    struct msi_desc *entry = irq_desc[vector].msi_desc;
+
+    BUG_ON(!entry || !entry->dev);
+    switch (entry->msi_attrib.type) {
+    case PCI_CAP_ID_MSI:
+        /* nothing to do */
+        break;
+    case PCI_CAP_ID_MSIX:
+    {
+        int offset = PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
+        readl(entry->mask_base + offset);
+        break;
+    }
+    default:
+        BUG();
+        break;
+    }
+}
+
+int msi_maskable_irq(const struct msi_desc *entry)
+{
+    BUG_ON(!entry);
+    return entry->msi_attrib.type != PCI_CAP_ID_MSI
+           || entry->msi_attrib.maskbit;
+}
+
+static void msi_set_mask_bit(unsigned int vector, int flag)
+{
+    struct msi_desc *entry = irq_desc[vector].msi_desc;
+
+    ASSERT(spin_is_locked(&irq_desc[vector].lock));
+    BUG_ON(!entry || !entry->dev);
+    switch (entry->msi_attrib.type) {
+    case PCI_CAP_ID_MSI:
+        if (entry->msi_attrib.maskbit) {
+            int pos;
+            u32 mask_bits;
+            u8 bus = entry->dev->bus;
+            u8 slot = PCI_SLOT(entry->dev->devfn);
+            u8 func = PCI_FUNC(entry->dev->devfn);
+
+            pos = (long)entry->mask_base;
+            mask_bits = pci_conf_read32(bus, slot, func, pos);
+            mask_bits &= ~(1);
+            mask_bits |= flag;
+            pci_conf_write32(bus, slot, func, pos, mask_bits);
+        }
+        break;
+    case PCI_CAP_ID_MSIX:
+    {
+        int offset = PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
+        writel(flag, entry->mask_base + offset);
+        readl(entry->mask_base + offset);
+        break;
+    }
+    default:
+        BUG();
+        break;
+    }
+    entry->msi_attrib.masked = !!flag;
+}
+
+void mask_msi_vector(unsigned int vector)
+{
+    msi_set_mask_bit(vector, 1);
+    msix_flush_writes(vector);
+}
+
+void unmask_msi_vector(unsigned int vector)
+{
+    msi_set_mask_bit(vector, 0);
+    msix_flush_writes(vector);
+}
+
+static struct msi_desc* alloc_msi_entry(void)
+{
+    struct msi_desc *entry;
+
+    entry = xmalloc(struct msi_desc);
+    if ( !entry )
+        return NULL;
+
+    INIT_LIST_HEAD(&entry->list);
+    entry->dev = NULL;
+    entry->remap_index = -1;
+
+    return entry;
+}
+
+int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+{
+    struct msi_msg msg;
+
+    msi_compose_msg(dev, desc->vector, &msg);
+    set_vector_msi(desc);
+    write_msi_msg(irq_desc[desc->vector].msi_desc, &msg);
+
+    return 0;
+}
+
+void teardown_msi_vector(int vector)
+{
+    unset_vector_msi(vector);
+}
+
+int msi_free_vector(struct msi_desc *entry)
+{
+    if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
+    {
+        unsigned long start;
+
+        writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+
+        start = (unsigned long)entry->mask_base & ~(PAGE_SIZE - 1);
+        msix_put_fixmap(entry->dev, virt_to_fix(start));
+    }
+
+    /* Free the unused IRTE if intr remap enabled */
+    if ( iommu_enabled )
+        iommu_update_ire_from_msi(entry, NULL);
+
+    list_del(&entry->list);
+    free_irq_vector(entry->vector);
+    xfree(entry);
+    return 0;
+}
+
+static struct msi_desc *find_msi_entry(struct pci_dev *dev,
+                                       int vector, int cap_id)
+{
+    struct msi_desc *entry;
+
+    list_for_each_entry( entry, &dev->msi_list, list )
+    {
+        if ( entry->msi_attrib.type == cap_id &&
+             (vector == -1 || entry->vector == vector) )
+            return entry;
+    }
+
+    return NULL;
+}
+
+/**
+ * msi_capability_init - configure device's MSI capability structure
+ * @dev: pointer to the pci_dev data structure of MSI device function
+ *
+ * Set up the MSI capability structure of the device function with a single
+ * MSI irq, regardless of whether the device function is capable of handling
+ * multiple messages. A return of zero indicates successful setup of entry
+ * zero with the new MSI irq; a non-zero value indicates failure.
+ **/
+static int msi_capability_init(struct pci_dev *dev,
+                               int vector,
+                               struct msi_desc **desc)
+{
+    struct msi_desc *entry;
+    int pos;
+    u16 control;
+    u8 bus = dev->bus;
+    u8 slot = PCI_SLOT(dev->devfn);
+    u8 func = PCI_FUNC(dev->devfn);
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSI);
+    control = pci_conf_read16(bus, slot, func, msi_control_reg(pos));
+    /* MSI Entry Initialization */
+    msi_set_enable(dev, 0); /* Ensure msi is disabled as I set it up */
+
+    entry = alloc_msi_entry();
+    if ( !entry )
+        return -ENOMEM;
+
+    entry->msi_attrib.type = PCI_CAP_ID_MSI;
+    entry->msi_attrib.is_64 = is_64bit_address(control);
+    entry->msi_attrib.entry_nr = 0;
+    entry->msi_attrib.maskbit = is_mask_bit_support(control);
+    entry->msi_attrib.masked = 1;
+    entry->msi_attrib.pos = pos;
+    entry->vector = vector;
+    if ( is_mask_bit_support(control) )
+        entry->mask_base = (void __iomem *)(long)msi_mask_bits_reg(pos,
+                                                                   is_64bit_address(control));
+    entry->dev = dev;
+    if ( entry->msi_attrib.maskbit )
+    {
+        unsigned int maskbits, temp;
+        /* All MSIs are unmasked by default; mask them all. */
+        maskbits = pci_conf_read32(bus, slot, func,
+                                   msi_mask_bits_reg(pos, is_64bit_address(control)));
+        temp = (1 << multi_msi_capable(control));
+        temp = ((temp - 1) & ~temp);
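+        /* temp is now ((1 << multi_msi_capable(control)) - 1), i.e. a mask
+         * covering every vector the device advertises. */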
+        maskbits |= temp;
+        pci_conf_write32(bus, slot, func,
+                         msi_mask_bits_reg(pos, is_64bit_address(control)),
+                         maskbits);
+    }
+    list_add_tail(&entry->list, &dev->msi_list);
+
+    *desc = entry;
+    /* Restore the original MSI enabled bits */
+    pci_conf_write16(bus, slot, func, msi_control_reg(pos), control);
+
+    return 0;
+}
+
+/**
+ * msix_capability_init - configure device's MSI-X capability
+ * @dev: pointer to the pci_dev data structure of MSI-X device function
+ * @entries: pointer to an array of struct msix_entry entries
+ * @nvec: number of @entries
+ *
+ * Set up the MSI-X capability structure of the device function with a
+ * single MSI-X irq. A return of zero indicates successful setup of the
+ * requested MSI-X entry with an allocated irq; non-zero indicates failure.
+ **/
+static int msix_capability_init(struct pci_dev *dev,
+                                struct msi_info *msi,
+                                struct msi_desc **desc)
+{
+    struct msi_desc *entry;
+    int pos;
+    u16 control;
+    unsigned long table_paddr, entry_paddr;
+    u32 table_offset, entry_offset;
+    u8 bir;
+    void __iomem *base;
+    int idx;
+    u8 bus = dev->bus;
+    u8 slot = PCI_SLOT(dev->devfn);
+    u8 func = PCI_FUNC(dev->devfn);
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    ASSERT(desc);
+
+    pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSIX);
+    control = pci_conf_read16(bus, slot, func, msix_control_reg(pos));
+    msix_set_enable(dev, 0); /* Ensure msix is disabled as I set it up */
+
+    /* MSI-X Table Initialization */
+    entry = alloc_msi_entry();
+    if ( !entry )
+        return -ENOMEM;
+
+    /* Request & Map MSI-X table region */
+    table_offset = pci_conf_read32(bus, slot, func, msix_table_offset_reg(pos));
+    bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
+    table_offset &= ~PCI_MSIX_FLAGS_BIRMASK;
+    entry_offset = msi->entry_nr * PCI_MSIX_ENTRY_SIZE;
+
+    table_paddr = msi->table_base + table_offset;
+    entry_paddr = table_paddr + entry_offset;
+    idx = msix_get_fixmap(dev, table_paddr, entry_paddr);
+    if ( idx < 0 )
+    {
+        xfree(entry);
+        return idx;
+    }
+    base = (void *)(fix_to_virt(idx) + (entry_paddr & ((1UL << PAGE_SHIFT) - 1)));
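+    /* base now points at this entry's slot within the fixmapped MSI-X
+     * table page. */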
+
+    entry->msi_attrib.type = PCI_CAP_ID_MSIX;
+    entry->msi_attrib.is_64 = 1;
+    entry->msi_attrib.entry_nr = msi->entry_nr;
+    entry->msi_attrib.maskbit = 1;
+    entry->msi_attrib.masked = 1;
+    entry->msi_attrib.pos = pos;
+    entry->vector = msi->vector;
+    entry->dev = dev;
+    entry->mask_base = base;
+
+    list_add_tail(&entry->list, &dev->msi_list);
+
+    /* Mask interrupt here */
+    writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+
+    *desc = entry;
+    /* Restore MSI-X enabled bits */
+    pci_conf_write16(bus, slot, func, msix_control_reg(pos), control);
+
+    return 0;
+}
+
+/**
+ * pci_enable_msi - configure device's MSI capability structure
+ * @dev: pointer to the pci_dev data structure of MSI device function
+ *
+ * Set up the MSI capability structure of the device function with
+ * a single MSI irq when its software driver requests that MSI mode
+ * be enabled on the hardware device function. A return of zero
+ * indicates successful setup of entry zero with the new MSI irq;
+ * a non-zero value indicates failure.
+ **/
+static int __pci_enable_msi(struct msi_info *msi, struct msi_desc **desc)
+{
+    int status;
+    struct pci_dev *pdev;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    pdev = pci_get_pdev(msi->bus, msi->devfn);
+    if ( !pdev )
+        return -ENODEV;
+
+    if ( find_msi_entry(pdev, msi->vector, PCI_CAP_ID_MSI) )
+    {
+        dprintk(XENLOG_WARNING, "vector %d is already mapped to MSI on "
+                "device %02x:%02x.%01x.\n", msi->vector, msi->bus,
+                PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+        return 0;
+    }
+
+    status = msi_capability_init(pdev, msi->vector, desc);
+    return status;
+}
+
+static void __pci_disable_msi(struct msi_desc *entry)
+{
+    struct pci_dev *dev;
+    int pos;
+    u16 control;
+    u8 bus, slot, func;
+
+    dev = entry->dev;
+    bus = dev->bus;
+    slot = PCI_SLOT(dev->devfn);
+    func = PCI_FUNC(dev->devfn);
+
+    pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSI);
+    control = pci_conf_read16(bus, slot, func, msi_control_reg(pos));
+    msi_set_enable(dev, 0);
+
+    BUG_ON(list_empty(&dev->msi_list));
+
+}
+
+/**
+ * pci_enable_msix - configure device's MSI-X capability structure
+ * @dev: pointer to the pci_dev data structure of MSI-X device function
+ * @entries: pointer to an array of MSI-X entries
+ * @nvec: number of MSI-X irqs requested for allocation by device driver
+ *
+ * Set up the MSI-X capability structure of the device function with the
+ * number of requested irqs when its software driver requests that MSI-X
+ * mode be enabled on the hardware device function. A return of zero
+ * indicates successful configuration of the MSI-X capability structure
+ * with newly allocated MSI-X irqs. A return of < 0 indicates a failure.
+ * A return of > 0 indicates that the driver requested more irqs than are
+ * available; the driver should use the returned value to re-send its
+ * request.
+ **/
+static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc)
+{
+    int status, pos, nr_entries;
+    struct pci_dev *pdev;
+    u16 control;
+    u8 slot = PCI_SLOT(msi->devfn);
+    u8 func = PCI_FUNC(msi->devfn);
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    pdev = pci_get_pdev(msi->bus, msi->devfn);
+    if ( !pdev )
+        return -ENODEV;
+
+    pos = pci_find_cap_offset(msi->bus, slot, func, PCI_CAP_ID_MSIX);
+    control = pci_conf_read16(msi->bus, slot, func, msi_control_reg(pos));
+    nr_entries = multi_msix_capable(control);
+    if (msi->entry_nr >= nr_entries)
+        return -EINVAL;
+
+    if ( find_msi_entry(pdev, msi->vector, PCI_CAP_ID_MSIX) )
+    {
+        dprintk(XENLOG_WARNING, "vector %d is already mapped to MSIX on "
+                "device %02x:%02x.%01x.\n", msi->vector, msi->bus,
+                PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+        return 0;
+    }
+
+    status = msix_capability_init(pdev, msi, desc);
+    return status;
+}
+
+static void __pci_disable_msix(struct msi_desc *entry)
+{
+    struct pci_dev *dev;
+    int pos;
+    u16 control;
+    u8 bus, slot, func;
+
+    dev = entry->dev;
+    bus = dev->bus;
+    slot = PCI_SLOT(dev->devfn);
+    func = PCI_FUNC(dev->devfn);
+
+    pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSIX);
+    control = pci_conf_read16(bus, slot, func, msix_control_reg(pos));
+    msix_set_enable(dev, 0);
+
+    BUG_ON(list_empty(&dev->msi_list));
+
+    writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+
+    pci_conf_write16(bus, slot, func, msix_control_reg(pos), control);
+}
+
+/*
+ * Note: this only constructs the msi_desc; irq_desc is not changed here,
+ * and the interrupt remains masked.
+ */
+int pci_enable_msi(struct msi_info *msi, struct msi_desc **desc)
+{
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
+    return  msi->table_base ? __pci_enable_msix(msi, desc) :
+        __pci_enable_msi(msi, desc);
+}
+
+/*
+ * Device only, no irq_desc
+ */
+void pci_disable_msi(struct msi_desc *msi_desc)
+{
+    if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI )
+        __pci_disable_msi(msi_desc);
+    else if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSIX )
+        __pci_disable_msix(msi_desc);
+}
+
+static void msi_free_vectors(struct pci_dev* dev)
+{
+    struct msi_desc *entry, *tmp;
+    irq_desc_t *desc;
+    unsigned long flags, vector;
+
+    list_for_each_entry_safe( entry, tmp, &dev->msi_list, list )
+    {
+        vector = entry->vector;
+        desc = &irq_desc[vector];
+        pci_disable_msi(entry);
+
+        spin_lock_irqsave(&desc->lock, flags);
+
+        teardown_msi_vector(vector);
+
+        if ( desc->handler == &pci_msi_type )
+        {
+            /* MSI is not shared, so should be released already */
+            BUG_ON(desc->status & IRQ_GUEST);
+            desc->handler = &no_irq_type;
+        }
+
+        spin_unlock_irqrestore(&desc->lock, flags);
+        msi_free_vector(entry);
+    }
+}
+
+void pci_cleanup_msi(struct pci_dev *pdev)
+{
+    /* Disable MSI and/or MSI-X */
+    msi_set_enable(pdev, 0);
+    msix_set_enable(pdev, 0);
+    msi_free_vectors(pdev);
+}
+
+int pci_restore_msi_state(struct pci_dev *pdev)
+{
+    unsigned long flags;
+    int vector;
+    struct msi_desc *entry, *tmp;
+    irq_desc_t *desc;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
+    if (!pdev)
+        return -EINVAL;
+
+    list_for_each_entry_safe( entry, tmp, &pdev->msi_list, list )
+    {
+        vector = entry->vector;
+        desc = &irq_desc[vector];
+
+        spin_lock_irqsave(&desc->lock, flags);
+
+        ASSERT(desc->msi_desc == entry);
+
+        if (desc->msi_desc != entry)
+        {
+            dprintk(XENLOG_ERR, "Restore MSI for dev %x:%x not set before?\n",
+                                pdev->bus, pdev->devfn);
+            spin_unlock_irqrestore(&desc->lock, flags);
+            return -EINVAL;
+        }
+
+        msi_set_enable(pdev, 0);
+        write_msi_msg(entry, &entry->msg);
+
+        msi_set_enable(pdev, 1);
+        msi_set_mask_bit(vector, entry->msi_attrib.masked);
+        spin_unlock_irqrestore(&desc->lock, flags);
+    }
+
+    return 0;
+}
+
+unsigned int pci_msix_get_table_len(struct pci_dev *pdev)
+{
+    int pos;
+    u16 control;
+    u8 bus, slot, func;
+    unsigned int len;
+
+    bus = pdev->bus;
+    slot = PCI_SLOT(pdev->devfn);
+    func = PCI_FUNC(pdev->devfn);
+
+    pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSIX);
+    if ( !pos )
+        return 0;
+
+    control = pci_conf_read16(bus, slot, func, msix_control_reg(pos));
+    len = msix_table_size(control) * PCI_MSIX_ENTRY_SIZE;
+
+    return len;
+}
diff -Naurp xen/arch/x86/nmi.c xen-redhat/arch/x86/nmi.c
--- xen/arch/x86/nmi.c
+++ xen-redhat/arch/x86/nmi.c
@@ -73,7 +73,7 @@ int nmi_active;
 #define P6_EVNTSEL_OS		(1 << 17)
 #define P6_EVNTSEL_USR		(1 << 16)
 #define P6_EVENT_CPU_CLOCKS_NOT_HALTED	0x79
-#define P6_NMI_EVENT		P6_EVENT_CPU_CLOCKS_NOT_HALTED
+#define CORE_EVENT_CPU_CLOCKS_NOT_HALTED 0x3c
 
 #define P4_ESCR_EVENT_SELECT(N)	((N)<<25)
 #define P4_CCCR_OVF_PMI0	(1<<26)
@@ -248,7 +248,7 @@ static void __pminit setup_k7_watchdog(v
     wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
 }
 
-static void __pminit setup_p6_watchdog(void)
+static void __pminit setup_p6_watchdog(unsigned counter)
 {
     unsigned int evntsel;
 
@@ -260,7 +260,7 @@ static void __pminit setup_p6_watchdog(v
     evntsel = P6_EVNTSEL_INT
         | P6_EVNTSEL_OS
         | P6_EVNTSEL_USR
-        | P6_NMI_EVENT;
+        | counter;
 
     wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
     write_watchdog_counter("P6_PERFCTR0");
@@ -314,14 +314,21 @@ void __pminit setup_apic_nmi_watchdog(vo
 
     switch (boot_cpu_data.x86_vendor) {
     case X86_VENDOR_AMD:
-        if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
+        switch (boot_cpu_data.x86) {
+	case 6:
+	case 0xf ... 0x17:
+	        setup_k7_watchdog();
+		break;
+	default:
             return;
-        setup_k7_watchdog();
+	}
         break;
     case X86_VENDOR_INTEL:
         switch (boot_cpu_data.x86) {
         case 6:
-            setup_p6_watchdog();
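+            /* Pre-Core P6 (model < 14) counts CPU_CLK_UNHALTED (0x79);
+             * Core and later use the architectural unhalted-core-cycles
+             * event (0x3c). */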
+            setup_p6_watchdog((boot_cpu_data.x86_model < 14) 
+                              ? P6_EVENT_CPU_CLOCKS_NOT_HALTED
+                              : CORE_EVENT_CPU_CLOCKS_NOT_HALTED);
             break;
         case 15:
             if (!setup_p4_watchdog())
diff -Naurp xen/arch/x86/numa.c xen-redhat/arch/x86/numa.c
--- xen/arch/x86/numa.c
+++ xen-redhat/arch/x86/numa.c
@@ -57,7 +57,7 @@ populate_memnodemap(const struct node *n
 {
 	int i; 
 	int res = -1;
-	unsigned long addr, end;
+	paddr_t addr, end;
 
 	if (shift >= 64)
 		return -1;
@@ -286,13 +286,13 @@ static void dump_numa(unsigned char key)
 		  (u32)(now>>32), (u32)now);
 
 	for_each_online_node(i) {
-		unsigned long pa = (NODE_DATA(i)->node_start_pfn + 1)<< PAGE_SHIFT;
+		paddr_t pa = (NODE_DATA(i)->node_start_pfn + 1)<< PAGE_SHIFT;
 		printk("idx%d -> NODE%d start->%lu size->%lu\n",
 			  i, NODE_DATA(i)->node_id,
 			  NODE_DATA(i)->node_start_pfn,
 			  NODE_DATA(i)->node_spanned_pages);
 		/* sanity check phys_to_nid() */
-		printk("phys_to_nid(%lx) -> %d should be %d\n", pa, phys_to_nid(pa),
+		printk("phys_to_nid(%"PRIpaddr") -> %d should be %d\n", pa, phys_to_nid(pa),
 			  NODE_DATA(i)->node_id);
 	}
 	for_each_online_cpu(i)
diff -Naurp xen/arch/x86/oprofile/nmi_int.c xen-redhat/arch/x86/oprofile/nmi_int.c
--- xen/arch/x86/oprofile/nmi_int.c
+++ xen-redhat/arch/x86/oprofile/nmi_int.c
@@ -291,37 +291,77 @@ static int __init p4_init(char ** cpu_ty
 }
 
 
+static int force_arch_perfmon;
+static int force_cpu_type(const char *str)
+{
+	if (!strcmp(str, "arch_perfmon")) {
+		force_arch_perfmon = 1;
+		printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
+	}
+
+	return 0;
+}
+custom_param("cpu_type", force_cpu_type);
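+/* Selected via the hypervisor boot option "cpu_type=arch_perfmon". */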
+
 static int __init ppro_init(char ** cpu_type)
 {
 	__u8 cpu_model = current_cpu_data.x86_model;
 
-	if (cpu_model > 15) {
-		printk("xenoprof: Initialization failed. "
-		       "Intel processor model %d for P6 class family is not "
-		       "supported\n", cpu_model);
+	if (force_arch_perfmon && cpu_has_arch_perfmon)
 		return 0;
-	}
-	else if (cpu_model == 15)
-		*cpu_type = "i386/core_2";
-	else if (cpu_model == 14)
-		*cpu_type = "i386/core";
-	else if (cpu_model == 9)
-		*cpu_type = "i386/p6_mobile";
-	else if (cpu_model > 5)
-		*cpu_type = "i386/piii";
-	else if (cpu_model > 2)
-		*cpu_type = "i386/pii";
-	else
+
+	switch (cpu_model) {
+	case 0 ... 2:
 		*cpu_type = "i386/ppro";
+		break;
+	case 3 ... 5:
+		*cpu_type = "i386/pii";
+		break;
+	case 6 ... 8:
+	case 10 ... 11:
+		*cpu_type = "i386/piii";
+		break;
+	case 9:
+	case 13:
+		*cpu_type = "i386/p6_mobile";
+		break;
+	case 14:
+		*cpu_type = "i386/core";
+		break;
+	case 15: case 23: case 29:
+		*cpu_type = "i386/core_2";
+		break;
+	case 26:
+		arch_perfmon_setup_counters();
+		*cpu_type = "i386/core_i7";
+		break;
+	case 28:
+		*cpu_type = "i386/atom";
+		break;
+	default:
+		/* Unknown */
+		return 0;
+	}
 
 	model = &op_ppro_spec;
 	return 1;
 }
 
+static int __init arch_perfmon_init(char **cpu_type)
+{
+	if (!cpu_has_arch_perfmon)
+		return 0;
+	*cpu_type = "i386/arch_perfmon";
+	model = &op_arch_perfmon_spec;
+	arch_perfmon_setup_counters();
+	return 1;
+}
+
 static int __init nmi_init(void)
 {
 	__u8 vendor = current_cpu_data.x86_vendor;
 	__u8 family = current_cpu_data.x86;
+	__u8 _model = current_cpu_data.x86_model;
  
 	if (!cpu_has_apic) {
 		printk("xenoprof: Initialization failed. No APIC\n");
@@ -348,6 +388,26 @@ static int __init nmi_init(void)
 				   give user space an consistent name. */
 				cpu_type = "x86-64/hammer";
 				break;
+			case 0x10:
+				model = &op_athlon_spec;
+				cpu_type = "x86-64/family10";
+				break;
+			case 0x11:
+				model = &op_athlon_spec;
+				cpu_type = "x86-64/family11";
+				break;
+			case 0x12:
+				model = &op_athlon_spec;
+				cpu_type = "x86-64/family12";
+				break;
+			case 0x14:
+				model = &op_athlon_spec;
+				cpu_type = "x86-64/family14";
+				break;
+			case 0x15:
+				model = &op_athlon_spec;
+				cpu_type = "x86-64/family15";
+				break;
 			}
 			break;
  
@@ -355,21 +415,22 @@ static int __init nmi_init(void)
 			switch (family) {
 				/* Pentium IV */
 				case 0xf:
-					if (!p4_init(&cpu_type))
-						return -ENODEV;
+					p4_init(&cpu_type);
 					break;
 
 				/* A P6-class processor */
 				case 6:
-					if (!ppro_init(&cpu_type))
-						return -ENODEV;
+					ppro_init(&cpu_type);
 					break;
 
 				default:
+				break;
+			}
+			if (!cpu_type && !arch_perfmon_init(&cpu_type)) {
 				printk("xenoprof: Initialization failed. "
-				       "Intel processor family %d is not "
-				       "supported\n", family);
-					return -ENODEV;
+				       "Intel processor family %d model %d "
+				       "is not supported\n", family, _model);
+				return -ENODEV;
 			}
 			break;
 
diff -Naurp xen/arch/x86/oprofile/op_model_athlon.c xen-redhat/arch/x86/oprofile/op_model_athlon.c
--- xen/arch/x86/oprofile/op_model_athlon.c
+++ xen-redhat/arch/x86/oprofile/op_model_athlon.c
@@ -34,12 +34,15 @@
 #define CTRL_WRITE(l,h,msrs,c) do {wrmsr(msrs->controls[(c)].addr, (l), (h));} while (0)
 #define CTRL_SET_ACTIVE(n) (n |= (1<<22))
 #define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
-#define CTRL_CLEAR(x) (x &= (1<<21))
+#define CTRL_CLEAR(lo, hi) (lo &= (1<<21), hi = 0)
 #define CTRL_SET_ENABLE(val) (val |= 1<<20)
 #define CTRL_SET_USR(val,u) (val |= ((u & 1) << 16))
 #define CTRL_SET_KERN(val,k) (val |= ((k & 1) << 17))
-#define CTRL_SET_UM(val, m) (val |= (m << 8))
-#define CTRL_SET_EVENT(val, e) (val |= e)
+#define CTRL_SET_UM(val, m) (val |= ((m & 0xff) << 8))
+#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff))
+#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf))
+#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9))
+#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8))
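+/* Family 10h event selects are wider than 8 bits: the upper event bits and
+ * the host/guest-only filters live in the high half of the EVNTSEL MSR,
+ * hence the separate EVENT_HIGH/HOST_ONLY/GUEST_ONLY macros. */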
 
 static unsigned long reset_value[NUM_COUNTERS];
 
@@ -72,7 +75,7 @@ static void athlon_setup_ctrs(struct op_
 	/* clear all counters */
 	for (i = 0 ; i < NUM_CONTROLS; ++i) {
 		CTRL_READ(low, high, msrs, i);
-		CTRL_CLEAR(low);
+		CTRL_CLEAR(low, high);
 		CTRL_WRITE(low, high, msrs, i);
 	}
 	
@@ -89,12 +92,15 @@ static void athlon_setup_ctrs(struct op_
 			CTR_WRITE(counter_config[i].count, msrs, i);
 
 			CTRL_READ(low, high, msrs, i);
-			CTRL_CLEAR(low);
+			CTRL_CLEAR(low, high);
 			CTRL_SET_ENABLE(low);
 			CTRL_SET_USR(low, counter_config[i].user);
 			CTRL_SET_KERN(low, counter_config[i].kernel);
 			CTRL_SET_UM(low, counter_config[i].unit_mask);
-			CTRL_SET_EVENT(low, counter_config[i].event);
+			CTRL_SET_EVENT_LOW(low, counter_config[i].event);
+			CTRL_SET_EVENT_HIGH(high, counter_config[i].event);
+			CTRL_SET_HOST_ONLY(high, 0);
+			CTRL_SET_GUEST_ONLY(high, 0);
 			CTRL_WRITE(low, high, msrs, i);
 		} else {
 			reset_value[i] = 0;
diff -Naurp xen/arch/x86/oprofile/op_model_ppro.c xen-redhat/arch/x86/oprofile/op_model_ppro.c
--- xen/arch/x86/oprofile/op_model_ppro.c
+++ xen-redhat/arch/x86/oprofile/op_model_ppro.c
@@ -22,12 +22,24 @@
 #include "op_x86_model.h"
 #include "op_counter.h"
 
-#define NUM_COUNTERS 2
-#define NUM_CONTROLS 2
+/*
+ * Intel "Architectural Performance Monitoring" CPUID
+ * detection/enumeration details:
+ */
+union cpuid10_eax {
+	struct {
+		unsigned int version_id:8;
+		unsigned int num_counters:8;
+		unsigned int bit_width:8;
+		unsigned int mask_length:8;
+	} split;
+	unsigned int full;
+};
+
+static int num_counters = 2;
+static int counter_width = 32;
 
-#define CTR_READ(l,h,msrs,c) do {rdmsr(msrs->counters[(c)].addr, (l), (h));} while (0)
-#define CTR_WRITE(l,msrs,c) do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), -1);} while (0)
-#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
+#define CTR_OVERFLOWED(n) (!((n) & (1ULL<<(counter_width-1)))) 
 
 #define CTRL_READ(l,h,msrs,c) do {rdmsr((msrs->controls[(c)].addr), (l), (h));} while (0)
 #define CTRL_WRITE(l,h,msrs,c) do {wrmsr((msrs->controls[(c)].addr), (l), (h));} while (0)
@@ -40,15 +52,16 @@
 #define CTRL_SET_UM(val, m) (val |= (m << 8))
 #define CTRL_SET_EVENT(val, e) (val |= e)
 
-static unsigned long reset_value[NUM_COUNTERS];
+static unsigned long reset_value[OP_MAX_COUNTER];
  
 static void ppro_fill_in_addresses(struct op_msrs * const msrs)
 {
-	msrs->counters[0].addr = MSR_P6_PERFCTR0;
-	msrs->counters[1].addr = MSR_P6_PERFCTR1;
-	
-	msrs->controls[0].addr = MSR_P6_EVNTSEL0;
-	msrs->controls[1].addr = MSR_P6_EVNTSEL1;
+	int i;
+
+	for (i = 0; i < num_counters; i++)
+		msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
+	for (i = 0; i < num_counters; i++)
+		msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
 }
 
 
@@ -56,25 +69,41 @@ static void ppro_setup_ctrs(struct op_ms
 {
 	unsigned int low, high;
 	int i;
+	
+	if (cpu_has_arch_perfmon) {
+		union cpuid10_eax eax;
+		eax.full = cpuid_eax(0xa);
+
+		/*
+		 * For Core2 (family 6, model 15), don't reset the
+		 * counter width:
+		 */
+		if (!(eax.split.version_id == 0 &&
+			current_cpu_data.x86 == 6 &&
+				current_cpu_data.x86_model == 15)) {
+
+			if (counter_width < eax.split.bit_width)
+				counter_width = eax.split.bit_width;
+		}
+	}
 
 	/* clear all counters */
-	for (i = 0 ; i < NUM_CONTROLS; ++i) {
+	for (i = 0 ; i < num_counters; ++i) {
 		CTRL_READ(low, high, msrs, i);
 		CTRL_CLEAR(low);
 		CTRL_WRITE(low, high, msrs, i);
 	}
 	
 	/* avoid a false detection of ctr overflows in NMI handler */
-	for (i = 0; i < NUM_COUNTERS; ++i) {
-		CTR_WRITE(1, msrs, i);
-	}
+	for (i = 0; i < num_counters; ++i)
+		wrmsrl(msrs->counters[i].addr, -1LL);
 
 	/* enable active counters */
-	for (i = 0; i < NUM_COUNTERS; ++i) {
+	for (i = 0; i < num_counters; ++i) {
 		if (counter_config[i].enabled) {
 			reset_value[i] = counter_config[i].count;
 
-			CTR_WRITE(counter_config[i].count, msrs, i);
+			wrmsrl(msrs->counters[i].addr, -reset_value[i]);
 
 			CTRL_READ(low, high, msrs, i);
 			CTRL_CLEAR(low);
@@ -84,6 +113,8 @@ static void ppro_setup_ctrs(struct op_ms
 			CTRL_SET_UM(low, counter_config[i].unit_mask);
 			CTRL_SET_EVENT(low, counter_config[i].event);
 			CTRL_WRITE(low, high, msrs, i);
+		} else {
+			reset_value[i] = 0;
 		}
 	}
 }
@@ -97,17 +128,19 @@ static int ppro_check_ctrs(unsigned int 
                            struct op_msrs const * const msrs,
                            struct cpu_user_regs * const regs)
 {
-	unsigned int low, high;
+	u64 val;
 	int i;
 	int ovf = 0;
 	unsigned long eip = regs->eip;
 	int mode = xenoprofile_get_mode(current, regs);
 
-	for (i = 0 ; i < NUM_COUNTERS; ++i) {
-		CTR_READ(low, high, msrs, i);
-		if (CTR_OVERFLOWED(low)) {
+	for (i = 0 ; i < num_counters; ++i) {
+		if (!reset_value[i])
+			continue;
+		rdmsrl(msrs->counters[i].addr, val);
+		if (CTR_OVERFLOWED(val)) {
 			xenoprof_log_event(current, regs, eip, mode, i);
-			CTR_WRITE(reset_value[i], msrs, i);
+			wrmsrl(msrs->counters[i].addr, -reset_value[i]);
 			ovf = 1;
 		}
 	}
@@ -123,27 +156,78 @@ static int ppro_check_ctrs(unsigned int 
 static void ppro_start(struct op_msrs const * const msrs)
 {
 	unsigned int low,high;
-	CTRL_READ(low, high, msrs, 0);
-	CTRL_SET_ACTIVE(low);
-	CTRL_WRITE(low, high, msrs, 0);
+	int i;
+
+	for (i = 0; i < num_counters; ++i) {
+		if (reset_value[i]) {
+			CTRL_READ(low, high, msrs, i);
+			CTRL_SET_ACTIVE(low);
+			CTRL_WRITE(low, high, msrs, i);
+		}
+	}
 }
 
 
 static void ppro_stop(struct op_msrs const * const msrs)
 {
 	unsigned int low,high;
-	CTRL_READ(low, high, msrs, 0);
-	CTRL_SET_INACTIVE(low);
-	CTRL_WRITE(low, high, msrs, 0);
+	int i;
+
+	for (i = 0; i < num_counters; ++i) {
+		if (!reset_value[i])
+			continue;
+		CTRL_READ(low, high, msrs, i);
+		CTRL_SET_INACTIVE(low);
+		CTRL_WRITE(low, high, msrs, i);
+	}
 }
 
 
-struct op_x86_model_spec const op_ppro_spec = {
-	.num_counters = NUM_COUNTERS,
-	.num_controls = NUM_CONTROLS,
+/*
+ * Architectural performance monitoring.
+ *
+ * Newer Intel CPUs (Core1+) have support for architectural
+ * events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details.
+ * The advantage of this is that it can be done without knowing about
+ * the specific CPU.
+ */
+void arch_perfmon_setup_counters(void)
+{
+	union cpuid10_eax eax;
+
+	eax.full = cpuid_eax(0xa);
+
+	/* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */
+	if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 &&
+	    current_cpu_data.x86_model == 15) {
+		eax.split.version_id = 2;
+		eax.split.num_counters = 2;
+		eax.split.bit_width = 40;
+	}
+
+	num_counters = min_t(u8, eax.split.num_counters, OP_MAX_COUNTER);
+
+	op_arch_perfmon_spec.num_counters = num_counters;
+	op_arch_perfmon_spec.num_controls = num_counters;
+	op_ppro_spec.num_counters = num_counters;
+	op_ppro_spec.num_controls = num_counters;
+}
+
+struct op_x86_model_spec op_ppro_spec = {
+	.num_counters = 2,
+	.num_controls = 2,
 	.fill_in_addresses = &ppro_fill_in_addresses,
 	.setup_ctrs = &ppro_setup_ctrs,
 	.check_ctrs = &ppro_check_ctrs,
 	.start = &ppro_start,
 	.stop = &ppro_stop
 };
+
+struct op_x86_model_spec op_arch_perfmon_spec = {
+	/* num_counters/num_controls filled in at runtime */
+	.fill_in_addresses = &ppro_fill_in_addresses,
+	.setup_ctrs = &ppro_setup_ctrs,
+	.check_ctrs = &ppro_check_ctrs,
+	.start = &ppro_start,
+	.stop = &ppro_stop,
+};
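
/*
 * Illustrative sketch, not part of the applied patch: how the CPUID
 * leaf 0xA EAX fields consumed by arch_perfmon_setup_counters() above
 * are laid out.  Field positions follow the Intel SDM; the union and
 * helper below (suffixed "_sketch") are stand-ins mirroring the
 * cpuid10_eax type used in the hunk, not symbols from the tree.
 */
#include <stdint.h>
#include <stdio.h>

union cpuid10_eax_sketch {
    struct {
        uint32_t version_id:8;    /* bits  7:0  - arch perfmon version  */
        uint32_t num_counters:8;  /* bits 15:8  - GP counters per core  */
        uint32_t bit_width:8;     /* bits 23:16 - counter width in bits */
        uint32_t mask_length:8;   /* bits 31:24 - event vector length   */
    } split;
    uint32_t full;
};

static uint32_t cpuid_eax_sketch(uint32_t leaf)
{
    uint32_t eax, ebx, ecx, edx;
    __asm__ volatile ("cpuid"
                      : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                      : "a" (leaf), "c" (0));
    return eax;
}

int main(void)
{
    union cpuid10_eax_sketch eax = { .full = cpuid_eax_sketch(0xa) };

    /* A zero version_id is what the family-6/model-15 (Core2) BIOS
     * workaround in arch_perfmon_setup_counters() papers over. */
    if (eax.split.version_id == 0)
        printf("no architectural perfmon reported\n");
    else
        printf("version %u, %u counters, %u bits wide\n",
               (unsigned int)eax.split.version_id,
               (unsigned int)eax.split.num_counters,
               (unsigned int)eax.split.bit_width);
    return 0;
}
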
diff -Naurp xen/arch/x86/oprofile/op_x86_model.h xen-redhat/arch/x86/oprofile/op_x86_model.h
--- xen/arch/x86/oprofile/op_x86_model.h
+++ xen-redhat/arch/x86/oprofile/op_x86_model.h
@@ -32,8 +32,8 @@ struct pt_regs;
  * various x86 CPU model's perfctr support.
  */
 struct op_x86_model_spec {
-	unsigned int const num_counters;
-	unsigned int const num_controls;
+	unsigned int num_counters;
+	unsigned int num_controls;
 	void (*fill_in_addresses)(struct op_msrs * const msrs);
 	void (*setup_ctrs)(struct op_msrs const * const msrs);
 	int (*check_ctrs)(unsigned int const cpu, 
@@ -43,9 +43,11 @@ struct op_x86_model_spec {
 	void (*stop)(struct op_msrs const * const msrs);
 };
 
-extern struct op_x86_model_spec const op_ppro_spec;
+extern struct op_x86_model_spec op_ppro_spec;
+extern struct op_x86_model_spec op_arch_perfmon_spec;
 extern struct op_x86_model_spec const op_p4_spec;
 extern struct op_x86_model_spec const op_p4_ht2_spec;
 extern struct op_x86_model_spec const op_athlon_spec;
 
+void arch_perfmon_setup_counters(void);
 #endif /* OP_X86_MODEL_H */
diff -Naurp xen/arch/x86/pci.c xen-redhat/arch/x86/pci.c
--- xen/arch/x86/pci.c
+++ xen-redhat/arch/x86/pci.c
@@ -0,0 +1,117 @@
+/******************************************************************************
+ * pci.c
+ * 
+ * Architecture-dependent PCI access functions.
+ */
+
+#include <xen/spinlock.h>
+#include <asm/io.h>
+
+#define PCI_CONF_ADDRESS(bus, dev, func, reg) \
+    (0x80000000 | (bus << 16) | (dev << 11) | (func << 8) | (reg & ~3))
+
+static DEFINE_SPINLOCK(pci_config_lock);
+
+uint32_t pci_conf_read(uint32_t cf8, uint8_t offset, uint8_t bytes)
+{
+    unsigned long flags;
+    uint32_t value;
+
+    BUG_ON((offset + bytes) > 4);
+
+    spin_lock_irqsave(&pci_config_lock, flags);
+
+    outl(cf8, 0xcf8);
+
+    switch ( bytes )
+    {
+    case 1:
+        value = inb(0xcfc + offset);
+        break;
+    case 2:
+        value = inw(0xcfc + offset);
+        break;
+    case 4:
+        value = inl(0xcfc + offset);
+        break;
+    default:
+        value = 0;
+        BUG();
+    }
+
+    spin_unlock_irqrestore(&pci_config_lock, flags);
+
+    return value;
+}
+
+void pci_conf_write(uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data)
+{
+    unsigned long flags;
+
+    BUG_ON((offset + bytes) > 4);
+
+    spin_lock_irqsave(&pci_config_lock, flags);
+
+    outl(cf8, 0xcf8);
+
+    switch ( bytes )
+    {
+    case 1:
+        outb((uint8_t)data, 0xcfc + offset);
+        break;
+    case 2:
+        outw((uint16_t)data, 0xcfc + offset);
+        break;
+    case 4:
+        outl(data, 0xcfc + offset);
+        break;
+    }
+
+    spin_unlock_irqrestore(&pci_config_lock, flags);
+}
+
+uint8_t pci_conf_read8(
+    unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg)
+{
+    BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255));
+    return pci_conf_read(PCI_CONF_ADDRESS(bus, dev, func, reg), reg & 3, 1);
+}
+
+uint16_t pci_conf_read16(
+    unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg)
+{
+    BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255));
+    return pci_conf_read(PCI_CONF_ADDRESS(bus, dev, func, reg), reg & 2, 2);
+}
+
+uint32_t pci_conf_read32(
+    unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg)
+{
+    BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255));
+    return pci_conf_read(PCI_CONF_ADDRESS(bus, dev, func, reg), 0, 4);
+}
+
+void pci_conf_write8(
+    unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg,
+    uint8_t data)
+{
+    BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255));
+    pci_conf_write(PCI_CONF_ADDRESS(bus, dev, func, reg), reg & 3, 1, data);
+}
+
+void pci_conf_write16(
+    unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg,
+    uint16_t data)
+{
+    BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255));
+    pci_conf_write(PCI_CONF_ADDRESS(bus, dev, func, reg), reg & 2, 2, data);
+}
+
+void pci_conf_write32(
+    unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg,
+    uint32_t data)
+{
+    BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255));
+    pci_conf_write(PCI_CONF_ADDRESS(bus, dev, func, reg), 0, 4, data);
+}
+
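
/*
 * Illustrative sketch, not part of the applied patch: the CONFIG_ADDRESS
 * word that pci_conf_read()/pci_conf_write() above latch into port 0xcf8.
 * Bit 31 enables the cycle; bus, device, function and the dword-aligned
 * register offset sit in the fields below, matching PCI_CONF_ADDRESS.
 * conf_address() is a stand-in name, not a symbol from the tree.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t conf_address(unsigned int bus, unsigned int dev,
                             unsigned int func, unsigned int reg)
{
    return 0x80000000u | (bus << 16) | (dev << 11) | (func << 8) | (reg & ~3u);
}

int main(void)
{
    /* pci_conf_read16(0, 0x1f, 0, 0x00) would latch this address into
     * 0xcf8 and then read 16 bits from port 0xcfc + (reg & 2). */
    printf("cf8 = %#010x\n", conf_address(0, 0x1f, 0, 0x00));
    return 0;
}
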
diff -Naurp xen/arch/x86/physdev.c xen-redhat/arch/x86/physdev.c
--- xen/arch/x86/physdev.c
+++ xen-redhat/arch/x86/physdev.c
@@ -1,4 +1,3 @@
-
 #include <xen/config.h>
 #include <xen/init.h>
 #include <xen/lib.h>
@@ -7,11 +6,13 @@
 #include <xen/irq.h>
 #include <xen/event.h>
 #include <xen/guest_access.h>
+#include <xen/iocap.h>
 #include <asm/current.h>
-#include <asm/smpboot.h>
+#include <asm/msi.h>
 #include <asm/hypercall.h>
 #include <public/xen.h>
 #include <public/physdev.h>
+#include <asm/p2m.h>
 
 #ifndef COMPAT
 typedef long ret_t;
@@ -24,10 +25,166 @@ int
 ioapic_guest_write(
     unsigned long physbase, unsigned int reg, u32 pval);
 
+static int physdev_map_pirq(struct physdev_map_pirq *map)
+{
+    struct domain *d;
+    int vector, pirq, ret = 0;
+    struct msi_info _msi;
+    void *map_data = NULL;
+
+    if ( !IS_PRIV(current->domain) )
+        return -EPERM;
+
+    if ( !map )
+        return -EINVAL;
+
+    if ( map->domid == DOMID_SELF )
+        d = rcu_lock_domain(current->domain);
+    else
+        d = rcu_lock_domain_by_id(map->domid);
+
+    if ( d == NULL )
+    {
+        ret = -ESRCH;
+        goto free_domain;
+    }
+
+    /* Verify or get vector. */
+    switch ( map->type )
+    {
+        case MAP_PIRQ_TYPE_GSI:
+            if ( map->index < 0 || map->index >= NR_IRQS )
+            {
+                dprintk(XENLOG_G_ERR, "dom%d: map invalid irq %d\n",
+                        d->domain_id, map->index);
+                ret = -EINVAL;
+                goto free_domain;
+            }
+            vector = IO_APIC_VECTOR(map->index);
+            if ( !vector )
+            {
+                dprintk(XENLOG_G_ERR, "dom%d: map irq with no vector %d\n",
+                        d->domain_id, vector);
+                ret = -EINVAL;
+                goto free_domain;
+            }
+            break;
+
+        case MAP_PIRQ_TYPE_MSI:
+            vector = map->index;
+            if ( vector == -1 )
+                vector = assign_irq_vector(AUTO_ASSIGN);
+
+            if ( vector < 0 || vector >= NR_VECTORS )
+            {
+                dprintk(XENLOG_G_ERR, "dom%d: map irq with wrong vector %d\n",
+                        d->domain_id, vector);
+                ret = -EINVAL;
+                goto free_domain;
+            }
+
+            _msi.bus = map->bus;
+            _msi.devfn = map->devfn;
+            _msi.entry_nr = map->entry_nr;
+            _msi.table_base = map->table_base;
+            _msi.vector = vector;
+            map_data = &_msi;
+            break;
+
+        default:
+            dprintk(XENLOG_G_ERR, "dom%d: wrong map_pirq type %x\n",
+                    d->domain_id, map->type);
+            ret = -EINVAL;
+            goto free_domain;
+    }
+
+    spin_lock(&pcidevs_lock);
+    /* Verify or get pirq. */
+    spin_lock(&d->event_lock);
+    pirq = domain_vector_to_irq(d, vector);
+    if ( map->pirq < 0 )
+    {
+        if ( pirq )
+        {
+            dprintk(XENLOG_G_ERR, "dom%d: %d:%d already mapped to %d\n",
+                    d->domain_id, map->index, map->pirq,
+                    pirq);
+            if ( pirq < 0 )
+            {
+                ret = -EBUSY;
+                goto done;
+            }
+        }
+        else
+        {
+            pirq = get_free_pirq(d, map->type, map->index);
+            if ( pirq < 0 )
+            {
+                dprintk(XENLOG_G_ERR, "dom%d: no free pirq\n", d->domain_id);
+                ret = pirq;
+                goto done;
+            }
+        }
+    }
+    else
+    {
+        if ( pirq && pirq != map->pirq )
+        {
+            dprintk(XENLOG_G_ERR, "dom%d: vector %d conflicts with irq %d\n",
+                    d->domain_id, map->index, map->pirq);
+            ret = -EEXIST;
+            goto done;
+        }
+        else
+            pirq = map->pirq;
+    }
+
+    ret = map_domain_pirq(d, pirq, vector, map->type, map_data);
+    if ( ret == 0 )
+        map->pirq = pirq;
+
+done:
+    spin_unlock(&d->event_lock);
+    spin_unlock(&pcidevs_lock);
+    if ( (ret != 0) && (map->type == MAP_PIRQ_TYPE_MSI) && (map->index == -1) )
+        free_irq_vector(vector);
+free_domain:
+    rcu_unlock_domain(d);
+    return ret;
+}
+
+static int physdev_unmap_pirq(struct physdev_unmap_pirq *unmap)
+{
+    struct domain *d;
+    int ret;
+
+    if ( !IS_PRIV(current->domain) )
+        return -EPERM;
+
+    if ( unmap->domid == DOMID_SELF )
+        d = rcu_lock_domain(current->domain);
+    else
+        d = rcu_lock_domain_by_id(unmap->domid);
+
+    if ( d == NULL )
+        return -ESRCH;
+
+    spin_lock(&pcidevs_lock);
+    spin_lock(&d->event_lock);
+    ret = unmap_domain_pirq(d, unmap->pirq);
+    spin_unlock(&d->event_lock);
+    spin_unlock(&pcidevs_lock);
+
+    rcu_unlock_domain(d);
+
+    return ret;
+}
+
 ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg)
 {
     int irq;
     ret_t ret;
+    struct vcpu *v = current;
 
     switch ( cmd )
     {
@@ -36,13 +193,13 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
         ret = -EFAULT;
         if ( copy_from_guest(&eoi, arg, 1) != 0 )
             break;
-        ret = pirq_guest_eoi(current->domain, eoi.irq);
+        ret = pirq_guest_eoi(v->domain, eoi.irq);
         break;
     }
 
     /* Legacy since 0x00030202. */
     case PHYSDEVOP_IRQ_UNMASK_NOTIFY: {
-        ret = pirq_guest_unmask(current->domain);
+        ret = pirq_guest_unmask(v->domain);
         break;
     }
 
@@ -56,21 +213,67 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
         if ( (irq < 0) || (irq >= NR_IRQS) )
             break;
         irq_status_query.flags = 0;
-        if ( pirq_acktype(irq) != 0 )
-            irq_status_query.flags |= XENIRQSTAT_needs_eoi;
-        if ( pirq_shared(irq) )
+        /*
+         * Even edge-triggered or message-based IRQs can need masking from
+         * time to time. If the guest is not dynamically checking for this
+         * via the new pirq_eoi_map mechanism, it must conservatively always
+         * execute the EOI hypercall. In practice, this only really makes a
+         * difference for maskable MSI sources, and if those are supported
+         * then dom0 is probably modern anyway.
+         */
+        irq_status_query.flags |= XENIRQSTAT_needs_eoi;
+        if ( pirq_shared(v->domain, irq) )
             irq_status_query.flags |= XENIRQSTAT_shared;
         ret = copy_to_guest(arg, &irq_status_query, 1) ? -EFAULT : 0;
         break;
     }
 
+    case PHYSDEVOP_set_device_msixtbl: {
+        struct physdev_device_msixtbl tbl;
+
+        ret = -EFAULT;
+        if ( copy_from_guest(&tbl, arg, 1) != 0 )
+            break;
+
+        spin_lock(&pcidevs_lock);
+        ret = pci_set_device_msixtbl(tbl.bus, tbl.devfn, tbl.gtable);
+        spin_unlock(&pcidevs_lock);
+
+        break;
+    }
+
+    case PHYSDEVOP_map_pirq: {
+        struct physdev_map_pirq map;
+
+        ret = -EFAULT;
+        if ( copy_from_guest(&map, arg, 1) != 0 )
+            break;
+
+        ret = physdev_map_pirq(&map);
+
+        if ( copy_to_guest(arg, &map, 1) != 0 )
+            ret = -EFAULT;
+        break;
+    }
+
+    case PHYSDEVOP_unmap_pirq: {
+        struct physdev_unmap_pirq unmap;
+
+        ret = -EFAULT;
+        if ( copy_from_guest(&unmap, arg, 1) != 0 )
+            break;
+
+        ret = physdev_unmap_pirq(&unmap);
+        break;
+    }
+
     case PHYSDEVOP_apic_read: {
         struct physdev_apic apic;
         ret = -EFAULT;
         if ( copy_from_guest(&apic, arg, 1) != 0 )
             break;
         ret = -EPERM;
-        if ( !IS_PRIV(current->domain) )
+        if ( !IS_PRIV(v->domain) )
             break;
         ret = ioapic_guest_read(apic.apic_physbase, apic.reg, &apic.value);
         if ( copy_to_guest(arg, &apic, 1) != 0 )
@@ -84,7 +287,7 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
         if ( copy_from_guest(&apic, arg, 1) != 0 )
             break;
         ret = -EPERM;
-        if ( !IS_PRIV(current->domain) )
+        if ( !IS_PRIV(v->domain) )
             break;
         ret = ioapic_guest_write(apic.apic_physbase, apic.reg, apic.value);
         break;
@@ -98,7 +301,7 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
             break;
 
         ret = -EPERM;
-        if ( !IS_PRIV(current->domain) )
+        if ( !IS_PRIV(v->domain) )
             break;
 
         irq = irq_op.irq;
@@ -107,7 +310,16 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
             break;
 
         irq_op.vector = assign_irq_vector(irq);
-        ret = copy_to_guest(arg, &irq_op, 1) ? -EFAULT : 0;
+
+        spin_lock(&pcidevs_lock);
+        spin_lock(&dom0->event_lock);
+        ret = map_domain_pirq(dom0, irq_op.irq, irq_op.vector,
+                              MAP_PIRQ_TYPE_GSI, NULL);
+        spin_unlock(&dom0->event_lock);
+        spin_unlock(&pcidevs_lock);
+
+        if ( copy_to_guest(arg, &irq_op, 1) != 0 )
+            ret = -EFAULT;
         break;
     }
 
@@ -120,7 +332,7 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
         if ( set_iopl.iopl > 3 )
             break;
         ret = 0;
-        current->arch.iopl = set_iopl.iopl;
+        v->arch.iopl = set_iopl.iopl;
         break;
     }
 
@@ -135,11 +347,37 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
             break;
         ret = 0;
 #ifndef COMPAT
-        current->arch.iobmp       = set_iobitmap.bitmap;
+        v->arch.iobmp       = set_iobitmap.bitmap;
 #else
-        guest_from_compat_handle(current->arch.iobmp, set_iobitmap.bitmap);
+        guest_from_compat_handle(v->arch.iobmp, set_iobitmap.bitmap);
 #endif
-        current->arch.iobmp_limit = set_iobitmap.nr_ports;
+        v->arch.iobmp_limit = set_iobitmap.nr_ports;
+        break;
+    }
+
+    case PHYSDEVOP_manage_pci_add: {
+        struct physdev_manage_pci manage_pci;
+        ret = -EPERM;
+        if ( !IS_PRIV(current->domain) )
+            break;
+        ret = -EFAULT;
+        if ( copy_from_guest(&manage_pci, arg, 1) != 0 )
+            break;
+
+        ret = pci_add_device(manage_pci.bus, manage_pci.devfn);
+        break;
+    }
+
+    case PHYSDEVOP_manage_pci_remove: {
+        struct physdev_manage_pci manage_pci;
+        ret = -EPERM;
+        if ( !IS_PRIV(current->domain) )
+            break;
+        ret = -EFAULT;
+        if ( copy_from_guest(&manage_pci, arg, 1) != 0 )
+            break;
+
+        ret = pci_remove_device(manage_pci.bus, manage_pci.devfn);
         break;
     }
 
diff -Naurp xen/arch/x86/platform_hypercall.c xen-redhat/arch/x86/platform_hypercall.c
--- xen/arch/x86/platform_hypercall.c
+++ xen-redhat/arch/x86/platform_hypercall.c
@@ -34,10 +34,19 @@ DEFINE_SPINLOCK(xenpf_lock);
 # define copy_from_compat copy_from_guest
 # undef copy_to_compat
 # define copy_to_compat copy_to_guest
+# undef guest_from_compat_handle
+# define guest_from_compat_handle(x,y) ((x)=(y))
 #else
 extern spinlock_t xenpf_lock;
 #endif
 
+static DEFINE_PER_CPU(uint64_t, freq);
+
+static long cpu_frequency_change_helper(void *data)
+{
+    return cpu_frequency_change(this_cpu(freq));
+}
+
 ret_t do_platform_op(XEN_GUEST_HANDLE(xen_platform_op_t) u_xenpf_op)
 {
     ret_t ret = 0;
@@ -247,11 +256,82 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
         }
         break;
 
+#if defined (CONFIG_X86_64) && !defined (COMPAT)
+    case XENPF_stratus_call:
+    {
+            extern int do_stratus(xenpf_stratus_call_t*);
+            ret = (ret_t)do_stratus(&(op->u.stratus_call));
+            if ( copy_to_guest(u_xenpf_op, op, 1) )
+                ret = -EFAULT;
+    }
+    break;
+#endif
+
+    case XENPF_change_freq:
+        ret = -ENOSYS;
+        if ( cpufreq_controller != FREQCTL_dom0_kernel )
+            break;
+        ret = -EINVAL;
+        if ( op->u.change_freq.flags || !cpu_online(op->u.change_freq.cpu) )
+            break;
+        per_cpu(freq, op->u.change_freq.cpu) = op->u.change_freq.freq;
+        ret = continue_hypercall_on_cpu(op->u.change_freq.cpu,
+                                        cpu_frequency_change_helper,
+                                        NULL);
+        break;
+
+    case XENPF_getidletime:
+    {
+        uint32_t cpu;
+        uint64_t idletime, now = NOW();
+        struct vcpu *v;
+        struct xenctl_cpumap ctlmap;
+        cpumask_t cpumap;
+        XEN_GUEST_HANDLE(uint8_t) cpumap_bitmap;
+        XEN_GUEST_HANDLE(uint64_t) idletimes;
+
+        ret = -ENOSYS;
+        if ( cpufreq_controller != FREQCTL_dom0_kernel )
+            break;
+
+        ctlmap.nr_cpus  = op->u.getidletime.cpumap_nr_cpus;
+        guest_from_compat_handle(cpumap_bitmap,
+                                 op->u.getidletime.cpumap_bitmap);
+        ctlmap.bitmap.p = cpumap_bitmap.p; /* handle -> handle_64 conversion */
+        xenctl_cpumap_to_cpumask(&cpumap, &ctlmap);
+        guest_from_compat_handle(idletimes, op->u.getidletime.idletime);
+
+        for_each_cpu_mask ( cpu, cpumap )
+        {
+            if ( (v = idle_vcpu[cpu]) != NULL )
+            {
+                idletime = v->runstate.time[RUNSTATE_running];
+                if ( v->is_running )
+                    idletime += now - v->runstate.state_entry_time;
+            }
+            else
+            {
+                idletime = 0;
+                cpu_clear(cpu, cpumap);
+            }
+
+            ret = -EFAULT;
+            if ( copy_to_guest_offset(idletimes, cpu, &idletime, 1) )
+                goto out;
+        }
+
+        op->u.getidletime.now = now;
+        cpumask_to_xenctl_cpumap(&ctlmap, &cpumap);
+        ret = copy_to_guest(u_xenpf_op, op, 1) ? -EFAULT : 0;
+    }
+    break;
+
     default:
         ret = -ENOSYS;
         break;
     }
 
+ out:
     spin_unlock(&xenpf_lock);
 
     return ret;
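
/*
 * Illustrative sketch, not part of the applied patch: the per-CPU idle
 * time arithmetic behind XENPF_getidletime above.  A CPU's idle time is
 * the idle vcpu's accumulated RUNSTATE_running time, plus the interval
 * currently in progress when that vcpu is running right now.  The
 * struct and helper (suffixed "_sketch") are illustrative stand-ins.
 */
#include <stdint.h>
#include <stdio.h>

struct idle_vcpu_sketch {
    uint64_t running_ns;          /* runstate.time[RUNSTATE_running] */
    uint64_t state_entry_time_ns; /* runstate.state_entry_time       */
    int      is_running;
};

static uint64_t idletime_ns(const struct idle_vcpu_sketch *v, uint64_t now_ns)
{
    uint64_t t = v->running_ns;
    if (v->is_running)
        t += now_ns - v->state_entry_time_ns;
    return t;
}

int main(void)
{
    struct idle_vcpu_sketch v = { 900000000ull, 950000000ull, 1 };
    /* 0.90s accumulated + (1.00s - 0.95s) in progress = 0.95s idle. */
    printf("%llu ns\n", (unsigned long long)idletime_ns(&v, 1000000000ull));
    return 0;
}
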
diff -Naurp xen/arch/x86/setup.c xen-redhat/arch/x86/setup.c
--- xen/arch/x86/setup.c
+++ xen-redhat/arch/x86/setup.c
@@ -19,6 +19,7 @@
 #include <xen/numa.h>
 #include <xen/rcupdate.h>
 #include <xen/vga.h>
+#include <xen/dmi.h>
 #include <public/version.h>
 #ifdef CONFIG_COMPAT
 #include <compat/platform.h>
@@ -44,7 +45,6 @@
 #define maddr_to_bootstrap_virt(m) ((void *)(long)(m))
 #endif
 
-extern void dmi_scan_machine(void);
 extern void generic_apic_probe(void);
 extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
 
@@ -109,6 +109,12 @@ extern void early_cpu_init(void);
 extern void vesa_init(void);
 extern void vesa_mtrr_init(void);
 
+DEFINE_PER_CPU(struct desc_struct *, gdt_table) = boot_cpu_gdt_table;
+#ifdef CONFIG_COMPAT
+DEFINE_PER_CPU(struct desc_struct *, compat_gdt_table)
+    = boot_cpu_compat_gdt_table;
+#endif
+
 struct tss_struct init_tss[NR_CPUS];
 
 char __attribute__ ((__section__(".bss.stack_aligned"))) cpu0_stack[STACK_SIZE];
@@ -314,41 +320,6 @@ static void __init move_memory(
 /* A temporary copy of the e820 map that we can mess with during bootstrap. */
 static struct e820map __initdata boot_e820;
 
-/* Reserve area (@s,@e) in the temporary bootstrap e820 map. */
-static int __init reserve_in_boot_e820(unsigned long s, unsigned long e)
-{
-    uint64_t rs, re;
-    int i;
-
-    for ( i = 0; i < boot_e820.nr_map; i++ )
-    {
-        /* Have we found the e820 region that includes the specified range? */
-        rs = boot_e820.map[i].addr;
-        re = rs + boot_e820.map[i].size;
-        if ( (s >= rs) && (e <= re) )
-            goto found;
-    }
-
-    return 0;
-
- found:
-    /* Start fragment. */
-    boot_e820.map[i].size = s - rs;
-
-    /* End fragment. */
-    if ( e < re )
-    {
-        memmove(&boot_e820.map[i+1], &boot_e820.map[i],
-                (boot_e820.nr_map-i) * sizeof(boot_e820.map[0]));
-        boot_e820.nr_map++;
-        i++;
-        boot_e820.map[i].addr = e;
-        boot_e820.map[i].size = re - e;
-    }
-
-    return 1;
-}
-
 struct boot_video_info {
     u8  orig_x;             /* 0x00 */
     u8  orig_y;             /* 0x01 */
@@ -411,6 +382,32 @@ static void __init parse_video_info(void
     }
 }
 
+void __init kexec_reserve_area(struct e820map *e820)
+{
+    unsigned long kdump_start = kexec_crash_area.start;
+    unsigned long kdump_size  = kexec_crash_area.size;
+    static int is_reserved = 0;
+
+    kdump_size = (kdump_size + PAGE_SIZE - 1) & PAGE_MASK;
+
+    if ( (kdump_start == 0) || (kdump_size == 0) || is_reserved )
+        return;
+
+    is_reserved = 1;
+
+    if ( !reserve_e820_ram(e820, kdump_start, kdump_start + kdump_size) )
+    {
+        printk("Kdump: DISABLED (failed to reserve %luMB (%lukB) at 0x%lx)"
+               "\n", kdump_size >> 20, kdump_size >> 10, kdump_start);
+        kexec_crash_area.start = kexec_crash_area.size = 0;
+    }
+    else
+    {
+        printk("Kdump: %luMB (%lukB) at 0x%lx\n",
+               kdump_size >> 20, kdump_size >> 10, kdump_start);
+    }
+}
+
 void init_done(void)
 {
     extern char __init_begin[], __init_end[];
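
/*
 * Illustrative sketch, not part of the applied patch: the page rounding
 * and e820 split behind kexec_reserve_area() above.  The crash size is
 * rounded up to a page boundary, then the range is punched out of the
 * containing RAM entry, leaving a start fragment and, when the range
 * ends early, a new end fragment -- the mechanism of the removed
 * reserve_in_boot_e820(), which reserve_e820_ram() now provides.
 */
#include <stdio.h>

#define PAGE_SIZE 4096ull
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
    unsigned long long kdump_size = 128ull * 1024 * 1024 + 123;  /* unaligned */
    unsigned long long rs = 0x100000, re = 0x40000000;           /* RAM entry */
    unsigned long long s = 0x2000000, e;

    kdump_size = (kdump_size + PAGE_SIZE - 1) & PAGE_MASK;       /* round up */
    e = s + kdump_size;

    printf("reserving      [%#llx, %#llx)\n", s, e);
    printf("start fragment [%#llx, %#llx)\n", rs, s);
    if (e < re)
        printf("end fragment   [%#llx, %#llx)\n", e, re);
    return 0;
}
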
@@ -483,6 +480,9 @@ void __init __start_xen(unsigned long mb
     set_current((struct vcpu *)0xfffff000); /* debug sanity */
     idle_vcpu[0] = current;
     set_processor_id(0); /* needed early, for smp_processor_id() */
+    if ( cpu_has_efer )
+        rdmsrl(MSR_EFER, this_cpu(efer));
+    asm volatile ( "mov %%cr4,%0" : "=r" (this_cpu(cr4)) );
 
     smp_prepare_boot_cpu();
 
@@ -556,14 +556,6 @@ void __init __start_xen(unsigned long mb
     if ( ((unsigned long)cpu0_stack & (STACK_SIZE-1)) != 0 )
         EARLY_FAIL("Misaligned CPU0 stack.\n");
 
-    /*
-     * Since there are some stubs getting built on the stacks which use
-     * direct calls/jumps, the heap must be confined to the lower 2G so
-     * that those branches can reach their targets.
-     */
-    if ( opt_xenheap_megabytes > 2048 )
-        opt_xenheap_megabytes = 2048;
-
     if ( e820_raw_nr != 0 )
     {
         memmap_type = "Xen-e820";
@@ -582,7 +574,7 @@ void __init __start_xen(unsigned long mb
     else if ( mbi->flags & MBI_MEMMAP )
     {
         memmap_type = "Multiboot-e820";
-        while ( bytes < mbi->mmap_length )
+        while ( (bytes < mbi->mmap_length) && (e820_raw_nr < E820MAX) )
         {
             memory_map_t *map = __va(mbi->mmap_addr + bytes);
 
@@ -633,47 +625,31 @@ void __init __start_xen(unsigned long mb
         EARLY_FAIL("Bootloader provided no memory information.\n");
     }
 
-    /* Ensure that all E820 RAM regions are page-aligned and -sized. */
-    for ( i = 0; i < e820_raw_nr; i++ )
-    {
-        uint64_t s, e;
-
-        if ( e820_raw[i].type != E820_RAM )
-            continue;
-        s = PFN_UP(e820_raw[i].addr);
-        e = PFN_DOWN(e820_raw[i].addr + e820_raw[i].size);
-        e820_raw[i].size = 0; /* discarded later */
-        if ( s < e )
-        {
-            e820_raw[i].addr = s << PAGE_SHIFT;
-            e820_raw[i].size = (e - s) << PAGE_SHIFT;
-        }
-    }
-
     /* Sanitise the raw E820 map to produce a final clean version. */
     max_page = init_e820(memmap_type, e820_raw, &e820_raw_nr);
 
+#ifdef CONFIG_X86_64
     /*
-     * Create a temporary copy of the E820 map. Truncate it to above 16MB
-     * as anything below that is already mapped and has a statically-allocated
-     * purpose.
+     * On x86/64 we are able to account for the allocation bitmap
+     * (allocated in common/page_alloc.c:init_boot_allocator()) stealing
+     * from the Xen heap. Here we make the Xen heap appropriately larger.
      */
+    opt_xenheap_megabytes += (max_page / 8) >> 20;
+#endif
+
+    /*
+     * Since there are some stubs getting built on the stacks which use
+     * direct calls/jumps, the heap must be confined to the lower 2G so
+     * that those branches can reach their targets.
+     */
+    if ( opt_xenheap_megabytes > 2048 )
+        opt_xenheap_megabytes = 2048;
+
+    /* Create a temporary copy of the E820 map. */
     memcpy(&boot_e820, &e820, sizeof(e820));
-    for ( i = 0; i < boot_e820.nr_map; i++ )
-    {
-        uint64_t s, e, min = 16 << 20; /* 16MB */
-        s = boot_e820.map[i].addr;
-        e = boot_e820.map[i].addr + boot_e820.map[i].size;
-        if ( s >= min )
-            continue;
-        if ( e > min )
-        {
-            boot_e820.map[i].addr = min;
-            boot_e820.map[i].size = e - min;
-        }
-        else
-            boot_e820.map[i].type = E820_RESERVED;
-    }
+
+    /* Early kexec reservation (explicit static start address). */
+    kexec_reserve_area(&boot_e820);
 
     /*
      * Iterate backwards over all superpage-aligned RAM regions.
@@ -693,9 +669,10 @@ void __init __start_xen(unsigned long mb
     {
         uint64_t s, e, mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
 
-        /* Superpage-aligned chunks up to BOOTSTRAP_DIRECTMAP_END, please. */
+        /* Superpage-aligned chunks from 16MB to BOOTSTRAP_DIRECTMAP_END. */
         s = (boot_e820.map[i].addr + mask) & ~mask;
         e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
+        s = max_t(uint64_t, s, 16 << 20);
         e = min_t(uint64_t, e, BOOTSTRAP_DIRECTMAP_END);
         if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) )
             continue;
@@ -796,71 +773,61 @@ void __init __start_xen(unsigned long mb
 
     if ( !initial_images_start )
         EARLY_FAIL("Not enough memory to relocate the dom0 kernel image.\n");
-    reserve_in_boot_e820(initial_images_start, initial_images_end);
+    reserve_e820_ram(&boot_e820, initial_images_start, initial_images_end);
 
-    /*
-     * With modules (and Xen itself, on x86/64) relocated out of the way, we
-     * can now initialise the boot allocator with some memory.
-     */
+    /* Initialise Xen heap and boot heap. */
     xenheap_phys_start = init_boot_allocator(__pa(&_end));
     xenheap_phys_end   = opt_xenheap_megabytes << 20;
 #if defined(CONFIG_X86_64)
     if ( !xen_phys_start )
         EARLY_FAIL("Not enough memory to relocate Xen.\n");
     xenheap_phys_end += xen_phys_start;
-    reserve_in_boot_e820(xen_phys_start,
-                         xen_phys_start + (opt_xenheap_megabytes<<20));
-    init_boot_pages(1<<20, 16<<20); /* Initial seed: 15MB */
-#else
-    init_boot_pages(xenheap_phys_end, 16<<20); /* Initial seed: 4MB */
+    reserve_e820_ram(&boot_e820, xen_phys_start,
+                     xen_phys_start + (opt_xenheap_megabytes<<20));
 #endif
 
-    if ( kexec_crash_area.size != 0 )
-    {
-        unsigned long kdump_start = kexec_crash_area.start;
-        unsigned long kdump_size  = kexec_crash_area.size;
-
-        kdump_size = (kdump_size + PAGE_SIZE - 1) & PAGE_MASK;
-
-        if ( !reserve_in_boot_e820(kdump_start, kdump_size) )
-        {
-            printk("Kdump: DISABLED (failed to reserve %luMB (%lukB) at 0x%lx)"
-                   "\n", kdump_size >> 20, kdump_size >> 10, kdump_start);
-            kexec_crash_area.start = kexec_crash_area.size = 0;
-        }
-        else
-        {
-            printk("Kdump: %luMB (%lukB) at 0x%lx\n",
-                   kdump_size >> 20, kdump_size >> 10, kdump_start);
-        }
-    }
+    /* Late kexec reservation (dynamic start address). */
+    kexec_reserve_area(&boot_e820);
 
     /*
-     * With the boot allocator now seeded, we can walk every RAM region and
-     * map it in its entirety (on x86/64, at least) and notify it to the
+     * With the boot allocator now initialised, we can walk every RAM region
+     * and map it in its entirety (on x86/64, at least) and notify it to the
      * boot allocator.
      */
     for ( i = 0; i < boot_e820.nr_map; i++ )
     {
-        uint64_t s, e, map_e, mask = PAGE_SIZE - 1;
+        uint64_t s, e, map_s, map_e, mask = PAGE_SIZE - 1;
 
         /* Only page alignment required now. */
         s = (boot_e820.map[i].addr + mask) & ~mask;
         e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
+#if defined(CONFIG_X86_32)
+        s = max_t(uint64_t, s, xenheap_phys_end);
+#else
+        s = max_t(uint64_t, s, 1<<20);
+#endif
         if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) )
             continue;
 
-        /* Perform the mapping (truncated in 32-bit mode). */
+        /* Need to create mappings above 16MB. */
+        map_s = max_t(uint64_t, s, 16<<20);
         map_e = e;
-#if defined(CONFIG_X86_32)
+#if defined(CONFIG_X86_32) /* mappings are truncated on x86_32 */
         map_e = min_t(uint64_t, map_e, BOOTSTRAP_DIRECTMAP_END);
 #endif
-        if ( s < map_e )
+
+        /* Pass mapped memory to allocator /before/ creating new mappings. */
+        init_boot_pages(s, min_t(uint64_t, map_s, e));
+
+        /* Create new mappings /before/ passing memory to the allocator. */
+        if ( map_s < map_e )
             map_pages_to_xen(
-                (unsigned long)maddr_to_bootstrap_virt(s),
-                s >> PAGE_SHIFT, (map_e-s) >> PAGE_SHIFT, PAGE_HYPERVISOR);
+                (unsigned long)maddr_to_bootstrap_virt(map_s),
+                map_s >> PAGE_SHIFT, (map_e-map_s) >> PAGE_SHIFT,
+                PAGE_HYPERVISOR);
 
-        init_boot_pages(s, e);
+        /* Pass remainder of this memory chunk to the allocator. */
+        init_boot_pages(map_s, e);
     }
 
     memguard_init();
@@ -988,6 +955,8 @@ void __init __start_xen(unsigned long mb
     if ( opt_nosmp )
         max_cpus = 0;
 
+    iommu_setup();
+
     smp_prepare_cpus(max_cpus);
 
     /*
@@ -1161,6 +1130,14 @@ void arch_get_xen_caps(xen_capabilities_
 #endif
 }
 
+int xen_in_range(paddr_t start, paddr_t end)
+{
+    start = max_t(paddr_t, start, xenheap_phys_start);
+    end = min_t(paddr_t, end, xenheap_phys_end);
+
+    return start < end;
+}
+
 /*
  * Local variables:
  * mode: C
diff -Naurp xen/arch/x86/smpboot.c xen-redhat/arch/x86/smpboot.c
--- xen/arch/x86/smpboot.c
+++ xen-redhat/arch/x86/smpboot.c
@@ -50,6 +50,7 @@
 #include <asm/div64.h>
 #include <asm/flushtlb.h>
 #include <asm/msr.h>
+#include <asm/mtrr.h>
 #include <mach_apic.h>
 #include <mach_wakecpu.h>
 #include <smpboot_hooks.h>
@@ -489,6 +490,9 @@ void __devinit start_secondary(void *unu
 	set_processor_id(cpu);
 	set_current(idle_vcpu[cpu]);
         this_cpu(curr_vcpu) = idle_vcpu[cpu];
+	if ( cpu_has_efer )
+		rdmsrl(MSR_EFER, this_cpu(efer));
+	asm volatile ( "mov %%cr4,%0" : "=r" (this_cpu(cr4)) );
 
 	percpu_traps_init();
 
@@ -531,6 +535,7 @@ void __devinit start_secondary(void *unu
 
 	/* We can take interrupts now: we're officially "up". */
 	local_irq_enable();
+	mtrr_ap_init();
 
         init_percpu_time();
 
@@ -543,40 +548,6 @@ extern struct {
 	unsigned short ss;
 } stack_start;
 
-#ifdef CONFIG_NUMA
-
-/* which logical CPUs are on which nodes */
-cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly =
-				{ [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
-/* which node each logical CPU is on */
-int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
-EXPORT_SYMBOL(cpu_2_node);
-
-/* set up a mapping between cpu and node. */
-static inline void map_cpu_to_node(int cpu, int node)
-{
-	printk("Mapping cpu %d to node %d\n", cpu, node);
-	cpu_set(cpu, node_2_cpu_mask[node]);
-	cpu_2_node[cpu] = node;
-}
-
-/* undo a mapping between cpu and node. */
-static inline void unmap_cpu_to_node(int cpu)
-{
-	int node;
-
-	printk("Unmapping cpu %d from all nodes\n", cpu);
-	for (node = 0; node < MAX_NUMNODES; node ++)
-		cpu_clear(cpu, node_2_cpu_mask[node]);
-	cpu_2_node[cpu] = 0;
-}
-#else /* !CONFIG_NUMA */
-
-#define map_cpu_to_node(cpu, node)	({})
-#define unmap_cpu_to_node(cpu)	({})
-
-#endif /* CONFIG_NUMA */
-
 u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
 
 static void map_cpu_to_logical_apicid(void)
@@ -585,13 +556,11 @@ static void map_cpu_to_logical_apicid(vo
 	int apicid = hard_smp_processor_id();
 
 	cpu_2_logical_apicid[cpu] = apicid;
-	map_cpu_to_node(cpu, apicid_to_node(apicid));
 }
 
 static void unmap_cpu_to_logical_apicid(int cpu)
 {
 	cpu_2_logical_apicid[cpu] = BAD_APICID;
-	unmap_cpu_to_node(cpu);
 }
 
 #if APIC_DEBUG
@@ -838,10 +807,15 @@ static int __devinit do_boot_cpu(int api
  */
 {
 	unsigned long boot_error;
+	unsigned int order;
 	int timeout;
 	unsigned long start_eip;
 	unsigned short nmi_high = 0, nmi_low = 0;
 	struct vcpu *v;
+	struct desc_struct *gdt;
+#ifdef __x86_64__
+        struct page_info *page;
+#endif
 
 	++cpucount;
 
@@ -861,6 +835,41 @@ static int __devinit do_boot_cpu(int api
 	/* Debug build: detect stack overflow by setting up a guard page. */
 	memguard_guard_stack(stack_start.esp);
 
+	gdt = per_cpu(gdt_table, cpu);
+	if (gdt == boot_cpu_gdt_table) {
+		order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
+#ifdef __x86_64__
+#ifdef CONFIG_COMPAT
+		page = alloc_domheap_pages(NULL, order, 0);
+		per_cpu(compat_gdt_table, cpu) = gdt = page_to_virt(page);
+		memcpy(gdt, boot_cpu_compat_gdt_table,
+		       NR_RESERVED_GDT_PAGES * PAGE_SIZE);
+		gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
+#endif
+		page = alloc_domheap_pages(NULL, order, 0);
+		per_cpu(gdt_table, cpu) = gdt = page_to_virt(page);
+#else
+		per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order);
+#endif
+		memcpy(gdt, boot_cpu_gdt_table,
+		       NR_RESERVED_GDT_PAGES * PAGE_SIZE);
+		BUILD_BUG_ON(NR_CPUS > 0x10000);
+		gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
+	}
+
+#ifdef __i386__
+	if (!per_cpu(doublefault_tss, cpu)) {
+		per_cpu(doublefault_tss, cpu) = alloc_xenheap_page();
+		memset(per_cpu(doublefault_tss, cpu), 0, PAGE_SIZE);
+	}
+#endif
+
+	if (!idt_tables[cpu]) {
+		idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES);
+		memcpy(idt_tables[cpu], idt_table,
+		       IDT_ENTRIES*sizeof(idt_entry_t));
+	}
+
 	/*
 	 * This grunge runs the startup process for
 	 * the targeted processor.
@@ -1121,6 +1130,7 @@ void __init smp_prepare_cpus(unsigned in
 	smp_commenced_mask = cpumask_of_cpu(0);
 	cpu_callin_map = cpumask_of_cpu(0);
 	mb();
+	mtrr_aps_sync_begin();
 	smp_boot_cpus(max_cpus);
 }
 
@@ -1158,6 +1168,7 @@ void __init smp_cpus_done(unsigned int m
 #ifdef CONFIG_X86_IO_APIC
 	setup_ioapic_dest();
 #endif
+	mtrr_aps_sync_end();
 #ifndef CONFIG_HOTPLUG_CPU
 	/*
 	 * Disable executability of the SMP trampoline:
diff -Naurp xen/arch/x86/smp.c xen-redhat/arch/x86/smp.c
--- xen/arch/x86/smp.c
+++ xen-redhat/arch/x86/smp.c
@@ -86,6 +86,12 @@ static inline void check_IPI_mask(cpumas
     ASSERT(!cpus_empty(cpumask));
 }
 
+void apic_wait_icr_idle(void)
+{
+	while ( apic_read( APIC_ICR ) & APIC_ICR_BUSY )
+		cpu_relax();
+}
+
 void send_IPI_mask_flat(cpumask_t cpumask, int vector)
 {
     unsigned long mask = cpus_addr(cpumask)[0];
diff -Naurp xen/arch/x86/srat.c xen-redhat/arch/x86/srat.c
--- xen/arch/x86/srat.c
+++ xen-redhat/arch/x86/srat.c
@@ -17,6 +17,7 @@
 #include <xen/nodemask.h>
 #include <xen/acpi.h>
 #include <xen/numa.h>
+#include <asm/e820.h>
 #include <asm/page.h>
 
 static struct acpi_table_slit *acpi_slit;
@@ -217,23 +218,39 @@ acpi_numa_memory_affinity_init(struct ac
 static int nodes_cover_memory(void)
 {
 	int i;
-	u64 pxmram, e820ram;
 
-	pxmram = 0;
-	for_each_node_mask(i, nodes_parsed) {
-		u64 s = nodes[i].start >> PAGE_SHIFT;
-		u64 e = nodes[i].end >> PAGE_SHIFT;
-		pxmram += e - s;
-	}
+	for (i = 0; i < e820.nr_map; i++) {
+		int j, found;
+		unsigned long long start, end;
 
-	e820ram = max_page;
-	/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
-	if ((long)(e820ram - pxmram) >= 1*1024*1024) {
-		printk(KERN_ERR "SRAT: PXMs only cover %"PRIu64"MB of your %"
-			PRIu64"MB e820 RAM. Not used.\n",
-			(pxmram << PAGE_SHIFT) >> 20,
-			(e820ram << PAGE_SHIFT) >> 20);
-		return 0;
+		if (e820.map[i].type != E820_RAM) {
+			continue;
+		}
+
+		start = e820.map[i].addr;
+		end = e820.map[i].addr + e820.map[i].size - 1;
+
+		do {
+			found = 0;
+			for_each_node_mask(j, nodes_parsed)
+				if (start < nodes[j].end
+				    && end > nodes[j].start) {
+					if (start >= nodes[j].start) {
+						start = nodes[j].end;
+						found = 1;
+					}
+					if (end <= nodes[j].end) {
+						end = nodes[j].start;
+						found = 1;
+					}
+				}
+		} while (found && start < end);
+
+		if (start < end) {
+			printk(KERN_ERR "SRAT: No PXM for e820 range: "
+				"%016Lx - %016Lx\n", start, end);
+			return 0;
+		}
 	}
 	return 1;
 }
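
/*
 * Illustrative sketch, not part of the applied patch: the range-trimming
 * check used by the new nodes_cover_memory() above.  Each e820 RAM range
 * is repeatedly shrunk against every parsed node range; if anything is
 * left once no node overlaps the remainder, some RAM has no PXM.  For
 * brevity this sketch uses half-open [start, end) ranges, whereas the
 * hunk computes an inclusive end address.
 */
#include <stdio.h>

struct range_sketch { unsigned long long start, end; };

static int covered(struct range_sketch ram,
                   const struct range_sketch *nodes, int nr)
{
    int j, found;
    do {
        found = 0;
        for (j = 0; j < nr; j++) {
            if (ram.start < nodes[j].end && ram.end > nodes[j].start) {
                if (ram.start >= nodes[j].start) {
                    ram.start = nodes[j].end;
                    found = 1;
                }
                if (ram.end <= nodes[j].end) {
                    ram.end = nodes[j].start;
                    found = 1;
                }
            }
        }
    } while (found && ram.start < ram.end);
    return ram.start >= ram.end;
}

int main(void)
{
    struct range_sketch nodes[] = {
        { 0, 0x80000000ull }, { 0x100000000ull, 0x180000000ull }
    };
    struct range_sketch ok   = { 0x01000000ull, 0x40000000ull }; /* in node 0 */
    struct range_sketch hole = { 0x80000000ull, 0xa0000000ull }; /* no node   */

    /* Prints "1 0": the first range is covered, the hole is not. */
    printf("%d %d\n", covered(ok, nodes, 2), covered(hole, nodes, 2));
    return 0;
}
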
diff -Naurp xen/arch/x86/sysctl.c xen-redhat/arch/x86/sysctl.c
--- xen/arch/x86/sysctl.c
+++ xen-redhat/arch/x86/sysctl.c
@@ -23,6 +23,10 @@
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
 #include <asm/processor.h>
+#include <asm/numa.h>
+#include <xen/nodemask.h>
+
+#define get_xen_guest_handle(val, hnd)  do { val = (hnd).p; } while (0)
 
 long arch_do_sysctl(
     struct xen_sysctl *sysctl, XEN_GUEST_HANDLE(xen_sysctl_t) u_sysctl)
@@ -34,25 +38,57 @@ long arch_do_sysctl(
 
     case XEN_SYSCTL_physinfo:
     {
+        uint32_t i, max_array_ent;
+
         xen_sysctl_physinfo_t *pi = &sysctl->u.physinfo;
 
         pi->threads_per_core =
             cpus_weight(cpu_sibling_map[0]);
         pi->cores_per_socket =
             cpus_weight(cpu_core_map[0]) / pi->threads_per_core;
-        pi->sockets_per_node = 
-            num_online_cpus() / cpus_weight(cpu_core_map[0]);
+        pi->nr_nodes = num_online_nodes();
+
+        /*
+         * RHEL5 ABI compat:
+         * Newer userspace expects 'sockets_per_node' to actually
+         * contain 'nr_cpus' data.
+         */
+        if (sysctl->interface_version > XEN_SYSCTL_INTERFACE_VERSION)
+            pi->sockets_per_node = (u32)num_online_cpus();
+        else
+            pi->sockets_per_node = num_online_cpus() /
+                (pi->nr_nodes * pi->cores_per_socket * pi->threads_per_core);
 
-        pi->nr_nodes         = 1;
         pi->total_pages      = total_pages;
         pi->free_pages       = avail_domheap_pages();
-        pi->scrub_pages      = avail_scrub_pages();
+        pi->scrub_pages      = 0;
         pi->cpu_khz          = cpu_khz;
         memset(pi->hw_cap, 0, sizeof(pi->hw_cap));
         memcpy(pi->hw_cap, boot_cpu_data.x86_capability, NCAPINTS*4);
-        ret = 0;
-        if ( copy_to_guest(u_sysctl, sysctl, 1) )
-            ret = -EFAULT;
+
+        max_array_ent = pi->max_cpu_id;
+        pi->max_cpu_id = last_cpu(cpu_online_map);
+        max_array_ent = min_t(uint32_t, max_array_ent, pi->max_cpu_id);
+
+        ret = -EFAULT;
+        /*
+         * RHEL5 ABI compat:
+         * Only fill in extended NUMA info if a newer userspace
+         * is talking to us.
+         */
+        if (sysctl->interface_version > XEN_SYSCTL_INTERFACE_VERSION)
+        {
+            if ( !guest_handle_is_null(pi->cpu_to_node) )
+            {
+                for ( i = 0; i <= max_array_ent; i++ )
+                {
+                    uint32_t node = cpu_online(i) ? cpu_to_node(i) : ~0u;
+                    if ( copy_to_guest_offset(pi->cpu_to_node, i, &node, 1) )
+                        break;
+                }
+            }
+        }
+        ret = copy_to_guest(u_sysctl, sysctl, 1) ? -EFAULT : 0;
     }
     break;
     
diff -Naurp xen/arch/x86/time.c xen-redhat/arch/x86/time.c
--- xen/arch/x86/time.c
+++ xen-redhat/arch/x86/time.c
@@ -177,7 +177,6 @@ static u64 init_pit_and_calibrate_tsc(vo
     unsigned long count;
 
     /* Set PIT channel 0 to HZ Hz. */
-#define CLOCK_TICK_RATE 1193180 /* crystal freq (Hz) */
 #define LATCH (((CLOCK_TICK_RATE)+(HZ/2))/HZ)
     outb_p(0x34, PIT_MODE);        /* binary, mode 2, LSB/MSB, ch 0 */
     outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
@@ -554,8 +553,7 @@ static void init_platform_timer(void)
 
     if ( (rc <= 0) &&
          !init_cyclone(pts) &&
-         !init_hpet(pts) &&
-         !init_pmtimer(pts) )
+         !init_hpet(pts) )
         init_pit(pts);
 
     plt_mask = (u32)~0u >> (32 - pts->counter_bits);
@@ -725,6 +723,37 @@ void update_domain_wallclock_time(struct
     spin_unlock(&wc_lock);
 }
 
+int cpu_frequency_change(u64 freq)
+{
+    struct cpu_time *t = &this_cpu(cpu_time);
+    u64 curr_tsc;
+
+    /* Sanity check: CPU frequency allegedly dropping below 1MHz? */
+    if ( freq < 1000000u )
+    {
+        gdprintk(XENLOG_WARNING, "Rejecting CPU frequency change "
+                 "to %"PRIu64" Hz.\n", freq);
+        return -EINVAL;
+    }
+
+    local_irq_disable();
+    rdtscll(curr_tsc);
+    t->local_tsc_stamp = curr_tsc;
+    t->stime_master_stamp = read_platform_stime();
+    /* TSC-extrapolated time may be bogus after frequency change. */
+    /*t->stime_local_stamp = get_s_time();*/
+    t->stime_local_stamp = t->stime_master_stamp;
+    set_time_scale(&t->tsc_scale, freq);
+    local_irq_enable();
+
+    /* A full epoch should pass before we check for deviation. */
+    set_timer(&t->calibration_timer, NOW() + EPOCH);
+    if ( smp_processor_id() == 0 )
+        platform_time_calibration();
+
+    return 0;
+}
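
/*
 * Illustrative sketch, not part of the applied patch: why
 * cpu_frequency_change() above re-snapshots the TSC/stime stamps and
 * recomputes the scale.  Local system time is extrapolated roughly as
 *     stime = stime_stamp + (tsc - tsc_stamp) * 1e9 / freq_hz
 * so after a frequency change both the stamps and the ns-per-tick scale
 * must be refreshed or the extrapolation drifts.  This is a simplified
 * model that overflows for large deltas; the real tsc_scale uses the
 * mul_frac/shift fixed-point form seen in local_time_calibration() below.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t stime_ns(uint64_t stime_stamp_ns, uint64_t tsc_stamp,
                         uint64_t tsc_now, uint64_t freq_hz)
{
    return stime_stamp_ns + (tsc_now - tsc_stamp) * 1000000000ull / freq_hz;
}

int main(void)
{
    /* At 2 GHz, 2e9 cycles past the stamp is one second of system time. */
    printf("%llu ns\n",
           (unsigned long long)stime_ns(0, 0, 2000000000ull, 2000000000ull));
    return 0;
}
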
+
 /* Set clock to <secs,usecs> after 00:00:00 UTC, 1 January, 1970. */
 void do_settime(unsigned long secs, unsigned long nsecs, u64 system_time_base)
 {
@@ -869,12 +898,14 @@ static void local_time_calibration(void 
            error_factor, calibration_mul_frac, tsc_shift);
 #endif
 
-    /* Record new timestamp information. */
+    /* Record new timestamp information, atomically w.r.t. interrupts. */
+    local_irq_disable();
     t->tsc_scale.mul_frac = calibration_mul_frac;
     t->tsc_scale.shift    = tsc_shift;
     t->local_tsc_stamp    = curr_tsc;
     t->stime_local_stamp  = curr_local_stime;
     t->stime_master_stamp = curr_master_stime;
+    local_irq_enable();
 
     update_vcpu_system_time(current);
 
@@ -974,6 +1005,50 @@ int time_resume(void)
     return 0;
 }
 
+int dom0_pit_access(struct ioreq *ioreq)
+{
+    /* Is Xen using Channel 2? Then disallow direct dom0 access. */
+    if ( plt_src.read_counter == read_pit_count )
+        return 0;
+
+    switch ( ioreq->addr )
+    {
+    case PIT_CH2:
+        if ( ioreq->dir == IOREQ_READ )
+            ioreq->data = inb(PIT_CH2);
+        else
+            outb(ioreq->data, PIT_CH2);
+        return 1;
+
+    case PIT_MODE:
+        if ( ioreq->dir == IOREQ_READ )
+            return 0; /* urk! */
+        switch ( ioreq->data & 0xc0 )
+        {
+        case 0xc0: /* Read Back */
+            if ( ioreq->data & 0x08 )    /* Select Channel 2? */
+                outb(ioreq->data & 0xf8, PIT_MODE);
+            if ( !(ioreq->data & 0x06) ) /* Select Channel 0/1? */
+                return 1; /* no - we're done */
+            /* Filter Channel 2 and reserved bit 0. */
+            ioreq->data &= ~0x09;
+            return 0; /* emulate ch0/1 readback */
+        case 0x80: /* Select Counter 2 */
+            outb(ioreq->data, PIT_MODE);
+            return 1;
+        }
+
+    case 0x61:
+        if ( ioreq->dir == IOREQ_READ )
+            ioreq->data = inb(0x61);
+        else
+            outb((inb(0x61) & ~3) | (ioreq->data & 3), 0x61);
+        return 1;
+    }
+
+    return 0;
+}
+
 /*
  * Local variables:
  * mode: C
diff -Naurp xen/arch/x86/traps.c xen-redhat/arch/x86/traps.c
--- xen/arch/x86/traps.c
+++ xen-redhat/arch/x86/traps.c
@@ -107,6 +107,8 @@ DECLARE_TRAP_HANDLER(spurious_interrupt_
 
 long do_set_debugreg(int reg, unsigned long value);
 unsigned long do_get_debugreg(int reg);
+void (*ioemul_handle_quirk)(
+    u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
 
 static int debug_stack_lines = 20;
 integer_param("debug_stack_lines", debug_stack_lines);
@@ -602,28 +604,76 @@ static int emulate_forced_invalid_op(str
         : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
         : "0" (a), "1" (b), "2" (c), "3" (d) );
 
-    if ( regs->eax == 1 )
+    if ( (regs->eax & 0x7fffffff) == 1 )
     {
         /* Modify Feature Information. */
         clear_bit(X86_FEATURE_VME, &d);
+        if ( !cpu_has_apic )
+            clear_bit(X86_FEATURE_APIC, &d);
         clear_bit(X86_FEATURE_DE,  &d);
         clear_bit(X86_FEATURE_PSE, &d);
         clear_bit(X86_FEATURE_PGE, &d);
-        if ( !supervisor_mode_kernel )
-            clear_bit(X86_FEATURE_SEP, &d);
+        clear_bit(X86_FEATURE_MCE, &d);
+        clear_bit(X86_FEATURE_MCA, &d);
         if ( !IS_PRIV(current->domain) )
             clear_bit(X86_FEATURE_MTRR, &d);
+        clear_bit(X86_FEATURE_PSE36, &d);
     }
-    else if ( regs->eax == 0x80000001 )
+    switch ( (uint32_t)regs->eax )
     {
+    case 1:
+        /* Modify Feature Information. */
+        if ( !supervisor_mode_kernel )
+            clear_bit(X86_FEATURE_SEP, &d);
+        clear_bit(X86_FEATURE_DS, &d);
+        clear_bit(X86_FEATURE_ACC, &d);
+        clear_bit(X86_FEATURE_PBE, &d);
+
+        clear_bit(X86_FEATURE_DTES64 % 32, &c);
+        clear_bit(X86_FEATURE_MWAIT % 32, &c);
+        clear_bit(X86_FEATURE_DSCPL % 32, &c);
+        clear_bit(X86_FEATURE_SMXE % 32, &c);
+        clear_bit(X86_FEATURE_TM2 % 32, &c);
+        if ( is_pv_32bit_vcpu(current) )
+            clear_bit(X86_FEATURE_CX16 % 32, &c);
+        clear_bit(X86_FEATURE_XTPR % 32, &c);
+        clear_bit(X86_FEATURE_PDCM % 32, &c);
+        clear_bit(X86_FEATURE_DCA % 32, &c);
+        clear_bit(X86_FEATURE_XSAVE % 32, &c);
+        set_bit(X86_FEATURE_HYPERVISOR % 32, &c);
+        break;
+    case 0x80000001:
         /* Modify Feature Information. */
         if ( is_pv_32bit_vcpu(current) )
+        {
             clear_bit(X86_FEATURE_SYSCALL % 32, &d);
+            clear_bit(X86_FEATURE_LM % 32, &d);
+            clear_bit(X86_FEATURE_LAHF_LM % 32, &c);
+        }
+        clear_bit(X86_FEATURE_PAGE1GB % 32, &d);
         clear_bit(X86_FEATURE_RDTSCP % 32, &d);
-    }
-    else
-    {
+
+        clear_bit(X86_FEATURE_OSVW % 32, &c);
+        clear_bit(X86_FEATURE_IBS % 32, &c);
+        clear_bit(X86_FEATURE_SKINIT % 32, &c);
+        clear_bit(X86_FEATURE_WDT % 32, &c);
+        clear_bit(X86_FEATURE_LWP % 32, &c);
+        clear_bit(X86_FEATURE_NODEID_MSR % 32, &c);
+        clear_bit(X86_FEATURE_TOPOEXT % 32, &c);
+        clear_bit(X86_FEATURE_PERFCTR_CORE % 32, &c);
+        clear_bit(X86_FEATURE_PERFCTR_NB % 32, &c);
+        break;
+    case 5: /* MONITOR/MWAIT */
+    case 0xa: /* Architectural Performance Monitor Features */
+    case 0x8000000a: /* SVM revision and features */
+    case 0x8000001b: /* Instruction Based Sampling */
+    case 0x8000001c: /* Light Weight Profiling */
+    case 0x8000001e: /* Extended topology reporting */
+        a = b = c = d = 0;
+        break;
+    default:
         (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
+        break;
     }
 
     regs->eax = a;
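
/*
 * Illustrative sketch, not part of the applied patch: why the hunk above
 * masks features with "X86_FEATURE_xxx % 32".  The feature constants in
 * the cpufeature headers encode word*32 + bit, so "% 32" recovers the
 * bit position inside the single 32-bit register (c or d) being edited
 * for that CPUID leaf.  FEATURE()/clear_feature() and the _DEMO constant
 * are stand-ins, not symbols from the tree.
 */
#include <stdio.h>

#define FEATURE(word, bit)  ((word) * 32 + (bit))
#define X86_FEATURE_DEMO    FEATURE(1, 11)   /* word 1, bit 11 */

static void clear_feature(unsigned int *reg, int feature)
{
    *reg &= ~(1u << (feature % 32));
}

int main(void)
{
    unsigned int edx = 0xffffffffu;          /* pretend CPUID:EDX word 1 */
    clear_feature(&edx, X86_FEATURE_DEMO);
    printf("bit %d cleared -> %#010x\n", X86_FEATURE_DEMO % 32, edx);
    return 0;
}
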
@@ -641,6 +691,7 @@ asmlinkage int do_invalid_op(struct cpu_
     struct bug_frame bug;
     struct bug_frame_str bug_str;
     char *filename, *predicate, *eip = (char *)regs->eip;
+    unsigned long fixup;
     int rc, id, lineno;
 
     DEBUGGER_trap_entry(TRAP_invalid_op, regs);
@@ -711,6 +762,11 @@ asmlinkage int do_invalid_op(struct cpu_
           predicate, filename, lineno);
 
  die:
+    if ( (fixup = search_exception_table(regs->eip)) != 0 )
+    {
+        regs->eip = fixup;
+        return 0;
+    }
     DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
     show_execution_state(regs);
     panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
@@ -738,6 +794,15 @@ asmlinkage void do_machine_check(struct 
     machine_check_vector(regs, regs->error_code);
 }
 
+static void reserved_bit_page_fault(
+    unsigned long addr, struct cpu_user_regs *regs)
+{
+    printk("d%d:v%d: reserved bit in page table (ec=%04X)\n",
+           current->domain->domain_id, current->vcpu_id, regs->error_code);
+    show_page_walk(addr);
+    show_execution_state(regs);
+}
+
 void propagate_page_fault(unsigned long addr, u16 error_code)
 {
     struct trap_info *ti;
@@ -761,10 +826,13 @@ void propagate_page_fault(unsigned long 
         tb->flags |= TBF_INTERRUPT;
     if ( unlikely(null_trap_bounce(v, tb)) )
     {
-        printk("Unhandled page fault in domain %d on VCPU %d (ec=%04X)\n",
+        printk("d%d:v%d: unhandled page fault (ec=%04X)\n",
                v->domain->domain_id, v->vcpu_id, error_code);
         show_page_walk(addr);
     }
+
+    if ( unlikely(error_code & PFEC_reserved_bit) )
+        reserved_bit_page_fault(addr, guest_cpu_user_regs());
 }
 
 static int handle_gdt_ldt_mapping_fault(
@@ -940,7 +1008,8 @@ static int fixup_page_fault(unsigned lon
     {
         if ( paging_mode_external(d) && guest_mode(regs) )
             return paging_fault(addr, regs);
-        if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
+        if ( !(regs->error_code & (PFEC_user_mode | PFEC_reserved_bit)) &&
+             (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
             return handle_gdt_ldt_mapping_fault(
                 addr - GDT_LDT_VIRT_START, regs);
         return 0;
@@ -950,7 +1019,8 @@ static int fixup_page_fault(unsigned lon
          guest_kernel_mode(v, regs) &&
          /* Do not check if access-protection fault since the page may 
             legitimately be not present in shadow page tables */
-         ((regs->error_code & PFEC_write_access) == PFEC_write_access) &&
+         ((regs->error_code & (PFEC_write_access|PFEC_reserved_bit)) ==
+          PFEC_write_access) &&
          ptwr_do_page_fault(v, addr, regs) )
         return EXCRET_fault_fixed;
 
@@ -990,6 +1060,8 @@ asmlinkage int do_page_fault(struct cpu_
         if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
         {
             perfc_incr(copy_user_faults);
+            if ( unlikely(regs->error_code & PFEC_reserved_bit) )
+                reserved_bit_page_fault(addr, regs);
             regs->eip = fixup;
             return 0;
         }
@@ -1117,7 +1189,7 @@ static int read_descriptor(unsigned int 
 }
 
 /* Has the guest requested sufficient permission for this I/O access? */
-static inline int guest_io_okay(
+static int guest_io_okay(
     unsigned int port, unsigned int bytes,
     struct vcpu *v, struct cpu_user_regs *regs)
 {
@@ -1159,19 +1231,126 @@ static inline int guest_io_okay(
 }
 
 /* Has the administrator granted sufficient permission for this I/O access? */
-static inline int admin_io_okay(
+static int admin_io_okay(
     unsigned int port, unsigned int bytes,
     struct vcpu *v, struct cpu_user_regs *regs)
 {
+    /*
+     * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
+     * We never permit direct access to that register.
+     */
+    if ( (port == 0xcf8) && (bytes == 4) )
+        return 0;
+
     return ioports_access_permitted(v->domain, port, port + bytes - 1);
 }
 
-#define guest_inb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
-#define guest_inw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
-#define guest_inl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
-#define guest_outb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
-#define guest_outw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
-#define guest_outl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
+static uint32_t guest_io_read(
+    unsigned int port, unsigned int bytes,
+    struct vcpu *v, struct cpu_user_regs *regs)
+{
+    extern uint32_t pci_conf_read(
+        uint32_t cf8, uint8_t offset, uint8_t bytes);
+
+    uint32_t data = 0;
+    unsigned int shift = 0;
+
+    if ( admin_io_okay(port, bytes, v, regs) )
+    {
+        switch ( bytes )
+        {
+        case 1: return inb(port);
+        case 2: return inw(port);
+        case 4: return inl(port);
+        }
+    }
+
+    while ( bytes != 0 )
+    {
+        unsigned int size = 1;
+        uint32_t sub_data = 0xff;
+
+        if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
+        {
+            sub_data = pv_pit_handler(port, 0, 0);
+        }
+        else if ( (port == 0xcf8) && (bytes == 4) )
+        {
+            size = 4;
+            sub_data = v->domain->arch.pci_cf8;
+        }
+        else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
+        {
+            size = min(bytes, 4 - (port & 3));
+            if ( size == 3 )
+                size = 2;
+            sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size);
+        }
+
+        if ( size == 4 )
+            return sub_data;
+
+        data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
+        shift += size * 8;
+        port += size;
+        bytes -= size;
+    }
+
+    return data;
+}
+
+static void guest_io_write(
+    unsigned int port, unsigned int bytes, uint32_t data,
+    struct vcpu *v, struct cpu_user_regs *regs)
+{
+    extern void pci_conf_write(
+        uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data);
+
+    if ( admin_io_okay(port, bytes, v, regs) )
+    {
+        switch ( bytes ) {
+        case 1:
+            outb((uint8_t)data, port);
+            break;
+        case 2:
+            outw((uint16_t)data, port);
+            break;
+        case 4:
+            outl(data, port);
+            break;
+        }
+        return;
+    }
+
+    while ( bytes != 0 )
+    {
+        unsigned int size = 1;
+
+        if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
+        {
+            pv_pit_handler(port, (uint8_t)data, 1);
+        }
+        else if ( (port == 0xcf8) && (bytes == 4) )
+        {
+            size = 4;
+            v->domain->arch.pci_cf8 = data;
+        }
+        else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
+        {
+            size = min(bytes, 4 - (port & 3));
+            if ( size == 3 )
+                size = 2;
+            pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data);
+        }
+
+        if ( size == 4 )
+            return;
+
+        port += size;
+        bytes -= size;
+        data >>= size * 8;
+    }
+}
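
/*
 * Illustrative sketch, not part of the applied patch: how guest_io_read()
 * above sizes a data-port access against the latched CONFIG_ADDRESS.  A
 * guest "inl" from 0xcfc is forwarded whole, an "inw" from 0xcfe becomes
 * pci_conf_read(cf8, offset = 2, bytes = 2), and a 3-byte remainder is
 * degraded to 2 bytes, exactly as in the loop.  cfc_access_size() is a
 * stand-in name for that size computation.
 */
#include <stdio.h>

static unsigned int cfc_access_size(unsigned int port, unsigned int bytes)
{
    unsigned int size = 4 - (port & 3);
    if (size > bytes)
        size = bytes;
    if (size == 3)
        size = 2;
    return size;
}

int main(void)
{
    printf("inl 0xcfc -> %u bytes at offset %u\n",
           cfc_access_size(0xcfc, 4), 0xcfcu & 3);
    printf("inw 0xcfe -> %u bytes at offset %u\n",
           cfc_access_size(0xcfe, 2), 0xcfeu & 3);
    printf("3-byte tail -> %u bytes\n", cfc_access_size(0xcfd, 3));
    return 0;
}
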
 
 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
 void host_to_guest_gpr_switch(struct cpu_user_regs *)
@@ -1198,6 +1377,12 @@ unsigned long guest_to_host_gpr_switch(u
 # define read_sreg(regs, sr) read_segment_register(sr)
 #endif
 
+static int is_cpufreq_controller(struct domain *d)
+{
+    return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
+            (d->domain_id == 0));
+}
+
 static int emulate_privileged_op(struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
@@ -1217,7 +1402,7 @@ static int emulate_privileged_op(struct 
                            ? (*(u32 *)&regs->reg = (val)) \
                            : (*(u16 *)&regs->reg = (val)))
     unsigned long code_base, code_limit;
-    char io_emul_stub[16];
+    char io_emul_stub[32];
     void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
     u32 l, h, eax, edx;
 
@@ -1286,7 +1471,7 @@ static int emulate_privileged_op(struct 
 
     /* REX prefix. */
     if ( rex & 8 ) /* REX.W */
-        op_bytes = 4; /* emulating only opcodes not supporting 64-bit operands */
+        op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
     modrm_reg = (rex & 4) << 1;  /* REX.R */
     /* REX.X does not need to be decoded. */
     modrm_rm  = (rex & 1) << 3;  /* REX.B */
@@ -1315,7 +1500,8 @@ static int emulate_privileged_op(struct 
         {
             if ( !read_descriptor(data_sel, v, regs,
                                   &data_base, &data_limit, &ar,
-                                  _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
+                                  _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|
+                                  _SEGMENT_P) )
                 goto fail;
             if ( !(ar & _SEGMENT_S) ||
                  !(ar & _SEGMENT_P) ||
@@ -1354,73 +1540,47 @@ static int emulate_privileged_op(struct 
         }
 #endif
 
+        port = (u16)regs->edx;
+
     continue_io_string:
         switch ( opcode )
         {
         case 0x6c: /* INSB */
             op_bytes = 1;
         case 0x6d: /* INSW/INSL */
-            if ( data_limit < op_bytes - 1 ||
-                 rd_ad(edi) > data_limit - (op_bytes - 1) ||
-                 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
+            if ( (data_limit < (op_bytes - 1)) ||
+                 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
+                 !guest_io_okay(port, op_bytes, v, regs) )
                 goto fail;
-            port = (u16)regs->edx;
-            switch ( op_bytes )
-            {
-            case 1:
-                /* emulate PIT counter 2 */
-                data = (u8)(guest_inb_okay(port, v, regs) ? inb(port) : 
-                       ((port == 0x42 || port == 0x43 || port == 0x61) ?
-                       pv_pit_handler(port, 0, 0) : ~0)); 
-                break;
-            case 2:
-                data = (u16)(guest_inw_okay(port, v, regs) ? inw(port) : ~0);
-                break;
-            case 4:
-                data = (u32)(guest_inl_okay(port, v, regs) ? inl(port) : ~0);
-                break;
-            }
-            if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), &data, op_bytes)) != 0 )
+            data = guest_io_read(port, op_bytes, v, regs);
+            if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
+                                    &data, op_bytes)) != 0 )
             {
                 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
                                      PFEC_write_access);
                 return EXCRET_fault_fixed;
             }
-            wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
+            wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF)
+                                         ? -op_bytes : op_bytes));
             break;
 
         case 0x6e: /* OUTSB */
             op_bytes = 1;
         case 0x6f: /* OUTSW/OUTSL */
-            if ( data_limit < op_bytes - 1 ||
-                 rd_ad(esi) > data_limit - (op_bytes - 1) ||
-                 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
+            if ( (data_limit < (op_bytes - 1)) ||
+                 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
+                 !guest_io_okay(port, op_bytes, v, regs) )
                 goto fail;
-            rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), op_bytes);
-            if ( rc != 0 )
+            if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
+                                      op_bytes)) != 0 )
             {
-                propagate_page_fault(data_base + rd_ad(esi) + op_bytes - rc, 0);
+                propagate_page_fault(data_base + rd_ad(esi)
+                                     + op_bytes - rc, 0);
                 return EXCRET_fault_fixed;
             }
-            port = (u16)regs->edx;
-            switch ( op_bytes )
-            {
-            case 1:
-                if ( guest_outb_okay(port, v, regs) )
-                    outb((u8)data, port);
-                else if ( port == 0x42 || port == 0x43 || port == 0x61 )
-                    pv_pit_handler(port, data, 1);
-                break;
-            case 2:
-                if ( guest_outw_okay(port, v, regs) )
-                    outw((u16)data, port);
-                break;
-            case 4:
-                if ( guest_outl_okay(port, v, regs) )
-                    outl((u32)data, port);
-                break;
-            }
-            wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
+            guest_io_write(port, op_bytes, data, v, regs);
+            wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF)
+                                         ? -op_bytes : op_bytes));
             break;
         }
 
@@ -1468,6 +1628,9 @@ static int emulate_privileged_op(struct 
     /* Handy function-typed pointer to the stub. */
     io_emul = (void *)io_emul_stub;
 
+    if ( ioemul_handle_quirk )
+        ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
+
     /* I/O Port and Interrupt Flag instructions. */
     switch ( opcode )
     {
@@ -1479,31 +1642,17 @@ static int emulate_privileged_op(struct 
     exec_in:
         if ( !guest_io_okay(port, op_bytes, v, regs) )
             goto fail;
-        switch ( op_bytes )
+        if ( admin_io_okay(port, op_bytes, v, regs) )
         {
-        case 1:
-            if ( guest_inb_okay(port, v, regs) )
-                io_emul(regs);
-            else if ( port == 0x42 || port == 0x43 || port == 0x61 )
-            {
-                regs->eax &= ~0xffUL;
-                regs->eax |= pv_pit_handler(port, 0, 0);
-            } 
-            else
-                regs->eax |= (u8)~0;
-            break;
-        case 2:
-            if ( guest_inw_okay(port, v, regs) )
-                io_emul(regs);
-            else
-                regs->eax |= (u16)~0;
-            break;
-        case 4:
-            if ( guest_inl_okay(port, v, regs) )
-                io_emul(regs);
+            io_emul(regs);
+        }
+        else
+        {
+            if ( op_bytes == 4 )
+                regs->eax = 0;
             else
-                regs->eax = (u32)~0;
-            break;
+                regs->eax &= ~((1u << (op_bytes * 8)) - 1);
+            regs->eax |= guest_io_read(port, op_bytes, v, regs);
         }
         goto done;
 
@@ -1521,22 +1670,11 @@ static int emulate_privileged_op(struct 
     exec_out:
         if ( !guest_io_okay(port, op_bytes, v, regs) )
             goto fail;
-        switch ( op_bytes )
+        if ( admin_io_okay(port, op_bytes, v, regs) )
+            io_emul(regs);
+        else
         {
-        case 1:
-            if ( guest_outb_okay(port, v, regs) )
-                io_emul(regs);
-            else if ( port == 0x42 || port == 0x43 || port == 0x61 )
-                pv_pit_handler(port, regs->eax, 1);
-            break;
-        case 2:
-            if ( guest_outw_okay(port, v, regs) )
-                io_emul(regs);
-            break;
-        case 4:
-            if ( guest_outl_okay(port, v, regs) )
-                io_emul(regs);
-            break;
+            guest_io_write(port, op_bytes, regs->eax, v, regs);
         }
         goto done;
 
@@ -1674,10 +1812,9 @@ static int emulate_privileged_op(struct 
             break;
 
         case 4: /* Write CR4 */
-            if ( *reg != (read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE)) )
-                gdprintk(XENLOG_WARNING,
-                         "Attempt to change CR4 flags %08lx -> %08lx\n",
-                         read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE), *reg);
+            v->arch.guest_context.ctrlreg[4] = pv_guest_cr4_fixup(*reg);
+            write_cr4(pv_guest_cr4_to_real_cr4(
+                v->arch.guest_context.ctrlreg[4]));
             break;
 
         default:
@@ -1698,7 +1835,7 @@ static int emulate_privileged_op(struct 
         eax = regs->eax;
         edx = regs->edx;
         res = ((u64)edx << 32) | eax;
-        switch ( regs->ecx )
+        switch ( (u32)regs->ecx )
         {
 #ifdef CONFIG_X86_64
         case MSR_FS_BASE:
@@ -1723,10 +1860,49 @@ static int emulate_privileged_op(struct 
             v->arch.guest_context.gs_base_user = res;
             break;
 #endif
+        case MSR_K8_FIDVID_STATUS:
+        case MSR_K8_FIDVID_CTL:
+        case MSR_K8_PSTATE_LIMIT:
+        case MSR_K8_PSTATE_CTRL:
+        case MSR_K8_PSTATE_STATUS:
+        case MSR_K8_PSTATE0:
+        case MSR_K8_PSTATE1:
+        case MSR_K8_PSTATE2:
+        case MSR_K8_PSTATE3:
+        case MSR_K8_PSTATE4:
+        case MSR_K8_PSTATE5:
+        case MSR_K8_PSTATE6:
+        case MSR_K8_PSTATE7:
+        case MSR_K8_HWCR:
+            if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+                goto fail;
+            if ( !is_cpufreq_controller(v->domain) )
+                break;
+            if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
+                goto fail;
+            break;
+        case MSR_IA32_MPERF:
+        case MSR_IA32_APERF:
+            if ( (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) &&
+                 (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
+                goto fail;
+            if ( !is_cpufreq_controller(v->domain) )
+                break;
+            if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
+                goto fail;
+            break;
+        case MSR_IA32_PERF_CTL:
+        case MSR_IA32_THERM_CONTROL:
+            if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+                goto fail;
+            if ( !is_cpufreq_controller(v->domain) )
+                break;
+            if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
+                goto fail;
+            break;
         default:
             if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
                 break;
-
             if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
                  (eax != l) || (edx != h) )
                 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
@@ -1736,8 +1912,12 @@ static int emulate_privileged_op(struct 
         }
         break;
 
+    case 0x31: /* RDTSC */
+        rdtsc(regs->eax, regs->edx);
+        break;
+
     case 0x32: /* RDMSR */
-        switch ( regs->ecx )
+        switch ( (u32)regs->ecx )
         {
 #ifdef CONFIG_X86_64
         case MSR_FS_BASE:
@@ -1759,10 +1939,48 @@ static int emulate_privileged_op(struct 
             regs->edx = v->arch.guest_context.gs_base_user >> 32;
             break;
 #endif
+        case MSR_K8_FIDVID_CTL:
+        case MSR_K8_FIDVID_STATUS:
+        case MSR_K8_PSTATE_LIMIT:
+        case MSR_K8_PSTATE_CTRL:
+        case MSR_K8_PSTATE_STATUS:
+        case MSR_K8_PSTATE0:
+        case MSR_K8_PSTATE1:
+        case MSR_K8_PSTATE2:
+        case MSR_K8_PSTATE3:
+        case MSR_K8_PSTATE4:
+        case MSR_K8_PSTATE5:
+        case MSR_K8_PSTATE6:
+        case MSR_K8_PSTATE7:
+            if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+                goto fail;
+            if ( !is_cpufreq_controller(v->domain) )
+            {
+                regs->eax = regs->edx = 0;
+                break;
+            }
+            if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) != 0 )
+                goto fail;
+            break;
         case MSR_EFER:
             if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
                 goto fail;
             break;
+        case MSR_IA32_MISC_ENABLE:
+            if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
+                goto fail;
+            regs->eax &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
+                           MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
+            regs->eax |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
+                         MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
+                         MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
+            break;
+        case MSR_IA32_THERM_CONTROL:
+            if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+                goto fail;
+            if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
+                goto fail;
+            break;
         default:
             if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
             {
@@ -2063,13 +2281,13 @@ void set_task_gate(unsigned int n, unsig
 void set_tss_desc(unsigned int n, void *addr)
 {
     _set_tssldt_desc(
-        gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
+        per_cpu(gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
         (unsigned long)addr,
         offsetof(struct tss_struct, __cacheline_filler) - 1,
         9);
 #ifdef CONFIG_COMPAT
     _set_tssldt_desc(
-        compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
+        per_cpu(compat_gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
         (unsigned long)addr,
         offsetof(struct tss_struct, __cacheline_filler) - 1,
         11);
diff -Naurp xen/arch/x86/x86_32/asm-offsets.c xen-redhat/arch/x86/x86_32/asm-offsets.c
--- xen/arch/x86/x86_32/asm-offsets.c
+++ xen-redhat/arch/x86/x86_32/asm-offsets.c
@@ -114,4 +114,7 @@ void __dummy__(void)
     BLANK();
 
     DEFINE(IRQSTAT_shift, LOG_2(sizeof(irq_cpustat_t)));
+    BLANK();
+
+    OFFSET(CPUINFO_ext_features, struct cpuinfo_x86, x86_capability[1]);
 }
diff -Naurp xen/arch/x86/x86_32/mm.c xen-redhat/arch/x86/x86_32/mm.c
--- xen/arch/x86/x86_32/mm.c
+++ xen-redhat/arch/x86/x86_32/mm.c
@@ -191,7 +191,7 @@ void __init subarch_init_memory(void)
     {
         /* Guest kernel runs in ring 0, not ring 1. */
         struct desc_struct *d;
-        d = &gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
+        d = &boot_cpu_gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
         d[0].b &= ~_SEGMENT_DPL;
         d[1].b &= ~_SEGMENT_DPL;
     }
diff -Naurp xen/arch/x86/x86_32/seg_fixup.c xen-redhat/arch/x86/x86_32/seg_fixup.c
--- xen/arch/x86/x86_32/seg_fixup.c
+++ xen-redhat/arch/x86/x86_32/seg_fixup.c
@@ -42,7 +42,7 @@
 #define O  OPCODE_BYTE
 #define M  HAS_MODRM
 
-static unsigned char insn_decode[256] = {
+static const u8 insn_decode[256] = {
     /* 0x00 - 0x0F */
     O|M, O|M, O|M, O|M, X, X, X, X,
     O|M, O|M, O|M, O|M, X, X, X, X,
@@ -69,7 +69,7 @@ static unsigned char insn_decode[256] = 
     X, X, X, X, X, X, X, X,
     /* 0x80 - 0x8F */
     O|M|1, O|M|4, O|M|1, O|M|1, O|M, O|M, O|M, O|M,
-    O|M, O|M, O|M, O|M, O|M, O|M, O|M, X,
+    O|M, O|M, O|M, O|M, O|M, X|M, O|M, O|M,
     /* 0x90 - 0x9F */
     X, X, X, X, X, X, X, X,
     X, X, X, X, X, X, X, X,
@@ -89,17 +89,28 @@ static unsigned char insn_decode[256] = 
     X, X, X, X, X, X, X, X,
     X, X, X, X, X, X, X, X,
     /* 0xF0 - 0xFF */
-    X, X, X, X, X, X, X, X,
+    X, X, X, X, X, X, O|M, O|M,
     X, X, X, X, X, X, O|M, O|M
 };
 
-static unsigned char twobyte_decode[256] = {
+static const u8 float_decode[64] = {
+    O|M, O|M, O|M, O|M, O|M, O|M, O|M, O|M, /* 0xD8 */
+    O|M, X, O|M, O|M, O|M, O|M, O|M, O|M, /* 0xD9 */
+    O|M, O|M, O|M, O|M, O|M, O|M, O|M, O|M, /* 0xDA */
+    O|M, X, O|M, O|M, X, O|M, X, O|M, /* 0xDB */
+    O|M, O|M, O|M, O|M, O|M, O|M, O|M, O|M, /* 0xDC */
+    O|M, O|M, O|M, O|M, O|M, X, O|M, O|M, /* 0xDD */
+    O|M, O|M, O|M, O|M, O|M, O|M, O|M, O|M, /* 0xDE */
+    O|M, X, O|M, O|M, O|M, O|M, O|M, O|M, /* 0xDF */
+};
+
+static const u8 twobyte_decode[256] = {
     /* 0x00 - 0x0F */
     X, X, X, X, X, X, X, X,
     X, X, X, X, X, X, X, X,
     /* 0x10 - 0x1F */
     X, X, X, X, X, X, X, X,
-    X, X, X, X, X, X, X, X,
+    O|M, X, X, X, X, X, X, X,
     /* 0x20 - 0x2F */
     X, X, X, X, X, X, X, X,
     X, X, X, X, X, X, X, X,
@@ -122,16 +133,16 @@ static unsigned char twobyte_decode[256]
     X, X, X, X, X, X, X, X,
     X, X, X, X, X, X, X, X,
     /* 0x90 - 0x9F */
-    X, X, X, X, X, X, X, X,
-    X, X, X, X, X, X, X, X,
+    O|M, O|M, O|M, O|M, O|M, O|M, O|M, O|M,
+    O|M, O|M, O|M, O|M, O|M, O|M, O|M, O|M,
     /* 0xA0 - 0xAF */
-    X, X, X, X, X, X, X, X,
-    X, X, X, X, X, X, X, X,
+    X, X, X, O|M, O|M|1, O|M, O|M, X,
+    X, X, X, O|M, O|M|1, O|M, X, O|M,
     /* 0xB0 - 0xBF */
-    X, X, X, X, X, X, X, X,
-    X, X, X, X, X, X, X, X,
+    X, X, X, O|M, X, X, O|M, O|M,
+    X, X, O|M|1, O|M, O|M, O|M, O|M, O|M,
     /* 0xC0 - 0xCF */
-    X, X, X, X, X, X, X, X,
+    O|M, O|M, X, O|M, X, X, X, O|M,
     X, X, X, X, X, X, X, X,
     /* 0xD0 - 0xDF */
     X, X, X, X, X, X, X, X,
@@ -155,22 +166,22 @@ static unsigned char twobyte_decode[256]
  */
 int get_baselimit(u16 seg, unsigned long *base, unsigned long *limit)
 {
-    struct vcpu *d = current;
-    unsigned long *table, a, b;
-    int            ldt = !!(seg & 4);
-    int            idx = (seg >> 3) & 8191;
+    struct vcpu *curr = current;
+    uint32_t    *table, a, b;
+    int          ldt = !!(seg & 4);
+    int          idx = (seg >> 3) & 8191;
 
     /* Get base and check limit. */
     if ( ldt )
     {
-        table = (unsigned long *)LDT_VIRT_START(d);
-        if ( idx >= d->arch.guest_context.ldt_ents )
+        table = (uint32_t *)LDT_VIRT_START(curr);
+        if ( idx >= curr->arch.guest_context.ldt_ents )
             goto fail;
     }
     else /* gdt */
     {
-        table = (unsigned long *)GDT_VIRT_START(d);
-        if ( idx >= d->arch.guest_context.gdt_ents )
+        table = (uint32_t *)GDT_VIRT_START(curr);
+        if ( idx >= curr->arch.guest_context.gdt_ents )
             goto fail;
     }
 
@@ -221,29 +232,29 @@ int linearise_address(u16 seg, unsigned 
 
 int fixup_seg(u16 seg, unsigned long offset)
 {
-    struct vcpu *d = current;
-    unsigned long *table, a, b, base, limit;
-    int            ldt = !!(seg & 4);
-    int            idx = (seg >> 3) & 8191;
+    struct vcpu *curr = current;
+    uint32_t    *table, a, b, base, limit;
+    int          ldt = !!(seg & 4);
+    int          idx = (seg >> 3) & 8191;
 
     /* Get base and check limit. */
     if ( ldt )
     {
-        table = (unsigned long *)LDT_VIRT_START(d);
-        if ( idx >= d->arch.guest_context.ldt_ents )
+        table = (uint32_t *)LDT_VIRT_START(curr);
+        if ( idx >= curr->arch.guest_context.ldt_ents )
         {
             dprintk(XENLOG_DEBUG, "Segment %04x out of LDT range (%ld)\n",
-                    seg, d->arch.guest_context.ldt_ents);
+                    seg, curr->arch.guest_context.ldt_ents);
             goto fail;
         }
     }
     else /* gdt */
     {
-        table = (unsigned long *)GDT_VIRT_START(d);
-        if ( idx >= d->arch.guest_context.gdt_ents )
+        table = (uint32_t *)GDT_VIRT_START(curr);
+        if ( idx >= curr->arch.guest_context.gdt_ents )
         {
             dprintk(XENLOG_DEBUG, "Segment %04x out of GDT range (%ld)\n",
-                    seg, d->arch.guest_context.gdt_ents);
+                    seg, curr->arch.guest_context.gdt_ents);
             goto fail;
         }
     }
@@ -261,7 +272,7 @@ int fixup_seg(u16 seg, unsigned long off
                _SEGMENT_G|_SEGMENT_CODE|_SEGMENT_DPL)) != 
          (_SEGMENT_P|_SEGMENT_S|_SEGMENT_DB|_SEGMENT_G|_SEGMENT_DPL) )
     {
-        dprintk(XENLOG_DEBUG, "Bad segment %08lx:%08lx\n", a, b);
+        dprintk(XENLOG_DEBUG, "Bad segment %08x:%08x\n", a, b);
         goto fail;
     }
 
@@ -291,8 +302,7 @@ int fixup_seg(u16 seg, unsigned long off
         }
     }
 
-    dprintk(XENLOG_DEBUG, "None of the above! "
-            "(%08lx:%08lx, %08lx, %08lx, %08lx)\n",
+    dprintk(XENLOG_DEBUG, "None of the above! (%08x:%08x, %08x, %08x, %08x)\n",
             a, b, base, limit, base+limit);
 
  fail:
@@ -315,18 +325,16 @@ int fixup_seg(u16 seg, unsigned long off
  */
 int gpf_emulate_4gb(struct cpu_user_regs *regs)
 {
-    struct vcpu *d = current;
-    struct trap_info   *ti;
-    struct trap_bounce *tb;
-    u8            modrm, mod, reg, rm, decode;
-    void         *memreg;
-    unsigned long offset;
-    u8            disp8;
-    u32           disp32 = 0;
+    struct vcpu   *curr = current;
+    u8             modrm, mod, rm, decode;
+    const u32     *base, *index = NULL;
+    unsigned long  offset;
+    s8             disp8;
+    s32            disp32 = 0;
     u8            *eip;         /* ptr to instruction start */
     u8            *pb, b;       /* ptr into instr. / current instr. byte */
-    int            gs_override = 0;
-    int            twobyte = 0;
+    int            gs_override = 0, scale = 0, opcode = -1;
+    const u8      *table = insn_decode;
 
     /* WARNING: We only work for ring-3 segments. */
     if ( unlikely(vm86_mode(regs)) || unlikely(!ring_3(regs)) )
@@ -357,6 +365,12 @@ int gpf_emulate_4gb(struct cpu_user_regs
             goto fail;
         }
 
+        if ( opcode != -1 )
+        {
+            opcode = (opcode << 8) | b;
+            break;
+        }
+
         switch ( b )
         {
         case 0x67: /* Address-size override */
@@ -375,6 +389,30 @@ int gpf_emulate_4gb(struct cpu_user_regs
         case 0x65: /* GS override */
             gs_override = 1;
             break;
+        case 0x0f: /* Not really a prefix byte */
+            table = twobyte_decode;
+            opcode = b;
+            break;
+        case 0xd8: /* Math coprocessor instructions.  */
+        case 0xd9:
+        case 0xda:
+        case 0xdb:
+        case 0xdc:
+        case 0xdd:
+        case 0xde:
+        case 0xdf:
+            /* Float opcodes have a secondary opcode in the modrm byte.  */
+            table = float_decode;
+            if ( get_user(modrm, pb + 1) )
+            {
+                dprintk(XENLOG_DEBUG, "Fault while extracting modrm byte\n");
+                goto page_fault;
+            }
+
+            opcode = (b << 8) | modrm;
+            b = ((b & 7) << 3) + ((modrm >> 3) & 7);
+            goto done_prefix;
+
         default: /* Not a prefix byte */
             goto done_prefix;
         }
@@ -387,47 +425,28 @@ int gpf_emulate_4gb(struct cpu_user_regs
         goto fail;
     }
 
-    decode = insn_decode[b]; /* opcode byte */
+    decode = table[b];
     pb++;
-    if ( decode == 0 && b == 0x0f )
-    {
-        twobyte = 1;
 
-        if ( get_user(b, pb) )
-        {
-            dprintk(XENLOG_DEBUG,
-                    "Fault while accessing byte %ld of instruction\n",
-                    (long)(pb-eip));
-            goto page_fault;
-        }
-
-        if ( (pb - eip) >= 15 )
-        {
-            dprintk(XENLOG_DEBUG, "Too many opcode bytes for a "
-                    "legal instruction\n");
-            goto fail;
-        }
-
-        decode = twobyte_decode[b];
-        pb++;
-    }
-
-    if ( decode == 0 )
+    if ( !(decode & OPCODE_BYTE) )
     {
-        dprintk(XENLOG_DEBUG, "Unsupported %sopcode %02x\n",
-                twobyte ? "two byte " : "", b);
+        if ( opcode == -1 )
+            dprintk(XENLOG_DEBUG, "Unsupported opcode %02x\n", b);
+        else
+            dprintk(XENLOG_DEBUG, "Unsupported opcode %02x %02x\n",
+                    opcode >> 8, opcode & 255);
         goto fail;
     }
 
     if ( !(decode & HAS_MODRM) )
     {
         /* Must be a <disp32>, or bail. */
-        if ( (decode & 7) != 4 )
+        if ( (decode & INSN_SUFFIX_BYTES) != 4 )
             goto fail;
 
         if ( get_user(offset, (u32 *)pb) )
         {
-            dprintk(XENLOG_DEBUG, "Fault while extracting <disp32>.\n");
+            dprintk(XENLOG_DEBUG, "Fault while extracting <moffs32>.\n");
             goto page_fault;
         }
         pb += 4;
@@ -448,29 +467,39 @@ int gpf_emulate_4gb(struct cpu_user_regs
     pb++;
 
     mod = (modrm >> 6) & 3;
-    reg = (modrm >> 3) & 7;
     rm  = (modrm >> 0) & 7;
 
     if ( rm == 4 )
     {
-        dprintk(XENLOG_DEBUG, "FIXME: Add decoding for the SIB byte.\n");
-        goto fixme;
+        u8 sib;
+
+        if ( get_user(sib, pb) )
+        {
+            dprintk(XENLOG_DEBUG, "Fault while extracting sib byte\n");
+            goto page_fault;
+        }
+
+        pb++;
+
+        rm = sib & 7;
+        if ( (sib & 0x38) != 0x20 )
+            index = decode_register((sib >> 3) & 7, regs, 0);
+        scale = sib >> 6;
     }
 
     /* Decode R/M field. */
-    memreg = decode_register(rm,  regs, 0);
+    base = decode_register(rm, regs, 0);
 
     /* Decode Mod field. */
-    switch ( modrm >> 6 )
+    switch ( mod )
     {
     case 0:
-        disp32 = 0;
         if ( rm == 5 ) /* disp32 rather than (EBP) */
         {
-            memreg = NULL;
+            base = NULL;
             if ( get_user(disp32, (u32 *)pb) )
             {
-                dprintk(XENLOG_DEBUG, "Fault while extracting <disp8>.\n");
+                dprintk(XENLOG_DEBUG, "Fault while extracting <base32>.\n");
                 goto page_fault;
             }
             pb += 4;
@@ -484,13 +513,13 @@ int gpf_emulate_4gb(struct cpu_user_regs
             goto page_fault;
         }
         pb++;
-        disp32 = (disp8 & 0x80) ? (disp8 | ~0xff) : disp8;;
+        disp32 = disp8;
         break;
 
     case 2:
         if ( get_user(disp32, (u32 *)pb) )
         {
-            dprintk(XENLOG_DEBUG, "Fault while extracting <disp8>.\n");
+            dprintk(XENLOG_DEBUG, "Fault while extracting <disp32>.\n");
             goto page_fault;
         }
         pb += 4;
@@ -502,8 +531,10 @@ int gpf_emulate_4gb(struct cpu_user_regs
     }
 
     offset = disp32;
-    if ( memreg != NULL )
-        offset += *(u32 *)memreg;
+    if ( base != NULL )
+        offset += *base;
+    if ( index != NULL )
+        offset += *index << scale;
 
  skip_modrm:
     if ( !fixup_seg((u16)regs->gs, offset) )
@@ -513,10 +544,11 @@ int gpf_emulate_4gb(struct cpu_user_regs
     perfc_incr(seg_fixups);
 
     /* If requested, give a callback on otherwise unused vector 15. */
-    if ( VM_ASSIST(d->domain, VMASST_TYPE_4gb_segments_notify) )
+    if ( VM_ASSIST(curr->domain, VMASST_TYPE_4gb_segments_notify) )
     {
-        ti  = &d->arch.guest_context.trap_ctxt[15];
-        tb  = &d->arch.trap_bounce;
+        struct trap_info   *ti  = &curr->arch.guest_context.trap_ctxt[15];
+        struct trap_bounce *tb  = &curr->arch.trap_bounce;
+
         tb->flags      = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
         tb->error_code = pb - eip;
         tb->cs         = ti->cs;
@@ -527,13 +559,6 @@ int gpf_emulate_4gb(struct cpu_user_regs
 
     return EXCRET_fault_fixed;
 
- fixme:
-    dprintk(XENLOG_DEBUG, "Undecodable instruction "
-            "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x "
-            "caused GPF(0) at %04x:%08x\n",
-            eip[0], eip[1], eip[2], eip[3],
-            eip[4], eip[5], eip[6], eip[7],
-            regs->cs, regs->eip);
  fail:
     return 0;
 
diff -Naurp xen/arch/x86/x86_32/supervisor_mode_kernel.S xen-redhat/arch/x86/x86_32/supervisor_mode_kernel.S
--- xen/arch/x86/x86_32/supervisor_mode_kernel.S
+++ xen-redhat/arch/x86/x86_32/supervisor_mode_kernel.S
@@ -100,15 +100,10 @@ ENTRY(fixup_ring0_guest_stack)
         # %gs:%esi now points to the guest stack before the
         # interrupt/exception occurred.
 
-        /*
-         * Reverse the __TSS macro, giving us the CPU number.
-         * The TSS for this cpu is at init_tss + ( cpu * 128 ).
-         */
-        str   %ecx
-        shrl  $3,%ecx                                   # Calculate GDT index for TSS.
-        subl  $(FIRST_RESERVED_GDT_ENTRY+8),%ecx        # %ecx = 2*cpu.
-        shll  $6,%ecx                                   # Each TSS entry is 0x80 bytes
-        addl  $init_tss,%ecx                            # but we have 2*cpu from above.
+        movl  $PER_CPU_GDT_ENTRY*8,%ecx
+        lsll  %ecx,%ecx
+        shll  $7,%ecx                                   # Each TSS entry is 0x80 bytes
+        addl  $init_tss,%ecx
 
         # Load Xen stack from TSS.
         movw  TSS_ss0(%ecx),%ax
diff -Naurp xen/arch/x86/x86_32/traps.c xen-redhat/arch/x86/x86_32/traps.c
--- xen/arch/x86/x86_32/traps.c
+++ xen-redhat/arch/x86/x86_32/traps.c
@@ -136,19 +136,20 @@ void show_page_walk(unsigned long addr)
     unmap_domain_page(l1t);
 }
 
-#define DOUBLEFAULT_STACK_SIZE 2048
-static struct tss_struct doublefault_tss;
-static unsigned char doublefault_stack[DOUBLEFAULT_STACK_SIZE];
-
+DEFINE_PER_CPU(struct tss_struct *, doublefault_tss);
+static unsigned char __attribute__ ((__section__ (".bss.page_aligned")))
+    boot_cpu_doublefault_space[PAGE_SIZE];
 asmlinkage void do_double_fault(void)
 {
-    struct tss_struct *tss = &doublefault_tss;
-    unsigned int cpu = ((tss->back_link>>3)-__FIRST_TSS_ENTRY)>>1;
+    struct tss_struct *tss;
+    unsigned int cpu;
 
     watchdog_disable();
 
     console_force_unlock();
 
+    asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
+
     /* Find information saved during fault and dump it to the console. */
     tss = &init_tss[cpu];
     printk("*** DOUBLE FAULT ***\n");
@@ -234,34 +235,36 @@ unsigned long do_iret(void)
 
 void __init percpu_traps_init(void)
 {
-    struct tss_struct *tss = &doublefault_tss;
+    struct tss_struct *tss = this_cpu(doublefault_tss);
     asmlinkage int hypercall(void);
 
-    if ( smp_processor_id() != 0 )
-        return;
+    if ( !tss )
+    {
+        /* The hypercall entry vector is only accessible from ring 1. */
+        _set_gate(idt_table+HYPERCALL_VECTOR, 14, 1, &hypercall);
 
-    /* The hypercall entry vector is only accessible from ring 1. */
-    _set_gate(idt_table+HYPERCALL_VECTOR, 14, 1, &hypercall);
+        tss = (void *)boot_cpu_doublefault_space;
+        this_cpu(doublefault_tss) = tss;
+    }
 
     /*
      * Make a separate task for double faults. This will get us debug output if
      * we blow the kernel stack.
      */
-    memset(tss, 0, sizeof(*tss));
     tss->ds     = __HYPERVISOR_DS;
     tss->es     = __HYPERVISOR_DS;
     tss->ss     = __HYPERVISOR_DS;
-    tss->esp    = (unsigned long)&doublefault_stack[DOUBLEFAULT_STACK_SIZE];
+    tss->esp    = (unsigned long)tss + PAGE_SIZE;
     tss->__cr3  = __pa(idle_pg_table);
     tss->cs     = __HYPERVISOR_CS;
     tss->eip    = (unsigned long)do_double_fault;
     tss->eflags = 2;
     tss->bitmap = IOBMP_INVALID_OFFSET;
     _set_tssldt_desc(
-        gdt_table + __DOUBLEFAULT_TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
+        this_cpu(gdt_table) + DOUBLEFAULT_TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
         (unsigned long)tss, 235, 9);
 
-    set_task_gate(TRAP_double_fault, __DOUBLEFAULT_TSS_ENTRY<<3);
+    set_task_gate(TRAP_double_fault, DOUBLEFAULT_TSS_ENTRY << 3);
 }
 
 void init_int80_direct_trap(struct vcpu *v)
diff -Naurp xen/arch/x86/x86_64/asm-offsets.c xen-redhat/arch/x86/x86_64/asm-offsets.c
--- xen/arch/x86/x86_64/asm-offsets.c
+++ xen-redhat/arch/x86/x86_64/asm-offsets.c
@@ -124,4 +124,7 @@ void __dummy__(void)
 #endif
 
     DEFINE(IRQSTAT_shift, LOG_2(sizeof(irq_cpustat_t)));
+    BLANK();
+
+    OFFSET(CPUINFO_ext_features, struct cpuinfo_x86, x86_capability[1]);
 }
diff -Naurp xen/arch/x86/x86_64/compat/mm.c xen-redhat/arch/x86/x86_64/compat/mm.c
--- xen/arch/x86/x86_64/compat/mm.c
+++ xen-redhat/arch/x86/x86_64/compat/mm.c
@@ -298,9 +298,8 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mm
 
                 BUG_ON(left == arg1);
                 BUG_ON(left > count);
-                guest_handle_add_offset(nat_ops, count - left);
-                BUG_ON(left + i < count);
-                guest_handle_add_offset(cmp_uops, (signed int)(count - left - i));
+                guest_handle_add_offset(nat_ops, i - left);
+                guest_handle_subtract_offset(cmp_uops, left);
                 left = 1;
                 BUG_ON(!hypercall_xlat_continuation(&left, 0x01, nat_ops, cmp_uops));
                 BUG_ON(left != arg1);
diff -Naurp xen/arch/x86/x86_64/Makefile xen-redhat/arch/x86/x86_64/Makefile
--- xen/arch/x86/x86_64/Makefile
+++ xen-redhat/arch/x86/x86_64/Makefile
@@ -1,4 +1,5 @@
 subdir-y += compat
+subdir-y += stratus
 
 obj-y += entry.o
 obj-y += gpr_switch.o
diff -Naurp xen/arch/x86/x86_64/mm.c xen-redhat/arch/x86/x86_64/mm.c
--- xen/arch/x86/x86_64/mm.c
+++ xen-redhat/arch/x86/x86_64/mm.c
@@ -428,7 +428,7 @@ int check_descriptor(const struct domain
 
 unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits)
 {
-    if ( d == NULL )
+    if ( (d == NULL) || (d->arch.physaddr_bitsize == 0) )
         return bits;
     return min(d->arch.physaddr_bitsize, bits);
 }
diff -Naurp xen/arch/x86/x86_64/stratus/host.c xen-redhat/arch/x86/x86_64/stratus/host.c
--- xen/arch/x86/x86_64/stratus/host.c
+++ xen-redhat/arch/x86/x86_64/stratus/host.c
@@ -0,0 +1,107 @@
+//#include "cc_os_defines.h"
+//#include "host.h"
+#include <asm/io.h>
+#include <asm/system.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <xen/spinlock.h>
+
+unsigned int
+OS_READ_REG_UINT32(
+    unsigned int * Reg
+    )
+{
+	return readl(Reg);
+}
+
+void
+OS_WRITE_REG_UINT32(
+    unsigned int * Reg,
+    unsigned int   Value
+    )
+{
+	writel(Value,Reg);
+}
+
+
+// - misc apic defines
+#define DELIVERY_PENDING        0x00001000
+#define DESTINATION_MASK        0xFF000000
+#define DESTINATION_SHIFT       24
+#define DELIVERY_MODE_MASK      0x00000700
+#define DELIVER_SMI             0x00000200
+#define DELIVER_NMI             0x00000400
+#define DELIVER_INIT            0x00000500
+#define DELIVER_STARTUP         0x00000600
+#define PHYSICAL_DESTINATION    0x00000000
+#define LOGICAL_DESTINATION     0x00000800
+#define EDGE_TRIGGERED          0x00000000
+#define LEVEL_ASSERT            0x00004000
+#define INT_MASKED              0x00010000
+#define ICR_SHORTHAND_MASK      0x000C0000
+#define ICR_USE_DEST_FIELD      0x00000000
+#define ICR_SELF                0x00040000
+#define ICR_ALL_INCL_SELF       0x00080000
+#define ICR_ALL_EXCL_SELF       0x000C0000
+
+
+#define APIC_REG_UINT32(Base,ByteOffset) \
+        (((unsigned int *)(Base))[(ByteOffset)/sizeof(unsigned int)])
+#define LU_ID_REGISTER    0x00000020
+#define LU_INT_CMD_LOW    0x00000300
+#define LU_INT_CMD_HIGH   0x00000310
+#define LU_INT_VECTOR_1   0x00000360
+
+// - used to poll until the apic is not busy
+#define STALL_WHILE_APIC_BUSY(ApicBase) \
+    do { \
+        while (OS_READ_REG_UINT32( \
+                    &APIC_REG_UINT32((ApicBase),LU_INT_CMD_LOW)) & \
+               DELIVERY_PENDING) \
+            ; \
+    } while(0)
+
+#define APIC_SMI_TO_PHYS_DEST  (DELIVER_SMI | PHYSICAL_DESTINATION | \
+                                ICR_USE_DEST_FIELD | EDGE_TRIGGERED)
+
+#define SMI_DEST_ALL   0xffffffff
+#define SMI_DEST_SELF  0xfffffffe
+
+// - get apic processor id 
+#define APIC_PROC_ID(ApicBase) \
+    ((OS_READ_REG_UINT32(&APIC_REG_UINT32((ApicBase),LU_ID_REGISTER)) \
+     & 0x0F000000) >> 24)
+
+unsigned int HostGetProcId(void)
+{
+    return APIC_PROC_ID(APIC_BASE);
+}
+
+void host_request_smi(unsigned int dest)
+{
+	unsigned char ProcId;
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	STALL_WHILE_APIC_BUSY(APIC_BASE);
+
+	switch (dest) {
+	case SMI_DEST_ALL:
+		OS_WRITE_REG_UINT32(&APIC_REG_UINT32(APIC_BASE,LU_INT_CMD_HIGH),
+			(unsigned int)(0xff << DESTINATION_SHIFT));
+	    break;
+	case SMI_DEST_SELF:
+	default:
+		ProcId = (unsigned char)(dest == SMI_DEST_SELF ? HostGetProcId() : dest);
+		OS_WRITE_REG_UINT32(&APIC_REG_UINT32(APIC_BASE,LU_INT_CMD_HIGH),
+			ProcId << DESTINATION_SHIFT);
+	}
+
+	OS_WRITE_REG_UINT32(&APIC_REG_UINT32(APIC_BASE,LU_INT_CMD_LOW),
+			APIC_SMI_TO_PHYS_DEST);
+
+	STALL_WHILE_APIC_BUSY(APIC_BASE);
+
+	local_irq_restore(flags);
+}
diff -Naurp xen/arch/x86/x86_64/stratus/Makefile xen-redhat/arch/x86/x86_64/stratus/Makefile
--- xen/arch/x86/x86_64/stratus/Makefile
+++ xen-redhat/arch/x86/x86_64/stratus/Makefile
@@ -0,0 +1 @@
+obj-y += stratus.o host.o
diff -Naurp xen/arch/x86/x86_64/stratus/stratus.c xen-redhat/arch/x86/x86_64/stratus/stratus.c
--- xen/arch/x86/x86_64/stratus/stratus.c
+++ xen-redhat/arch/x86/x86_64/stratus/stratus.c
@@ -0,0 +1,211 @@
+#include <xen/errno.h>
+#include <xen/lib.h>
+#include <xen/smp.h>
+#include <xen/sched.h>
+#include <xen/dmi.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#ifdef __XEN_COMPAT_H
+#undef __XEN_COMPAT_H
+#endif
+#include <public/platform.h>
+
+extern void host_request_smi(unsigned int dest);
+
+static long cc_cr4(xenpf_stratus_call_t *cc_call) {
+	int rw = cc_call->u.cr4.rw;
+	unsigned long cr4;
+
+	if (rw) {	// Write
+		return -ENOSYS;
+	} else {	// Read
+		asm("movq %%cr4,%0" : "=r" (cr4));
+		cc_call->u.cr4.cr4 = cr4;
+	}
+
+	return 0;
+}
+
+static long cc_cpuid(xenpf_stratus_call_t *cc_call) {
+	cpuid(  cc_call->u.cpuid.op,
+		&cc_call->u.cpuid.eax,
+		&cc_call->u.cpuid.ebx,
+		&cc_call->u.cpuid.ecx,
+		&cc_call->u.cpuid.edx );
+
+	return 0;
+}
+
+static long cc_rw_msr(xenpf_stratus_call_t *cc_call) {
+	if (cc_call->u.msr.rw == 0) {
+		// Read
+		rdmsrl(cc_call->u.msr.msr, cc_call->u.msr.val);
+	} else {
+		wrmsrl(cc_call->u.msr.msr, cc_call->u.msr.val);
+	}
+
+	return 0;
+}
+
+static long cc_lapic_id(xenpf_stratus_call_t *cc_call) {
+	cc_call->u.ls.id = GET_APIC_ID(apic_read(APIC_ID));
+	return 0;
+}
+
+#define DUMP_VECTOR_PHYS  	(0xf00)
+#define HOST_BIOS_VECTOR_PHYS	(0xff0)
+#define HOST_BIOS_VECTOR_SIZE	(0x10)
+
+
+static long cc_rw_hbv(xenpf_stratus_call_t *cc_call) {
+	int rw = cc_call->u.rw.rw;
+	int size = cc_call->u.rw.size;
+	unsigned long where = cc_call->u.rw.where;
+
+	if (((unsigned long)where + (unsigned long)size) > 
+			HOST_BIOS_VECTOR_SIZE) {
+		return -EFAULT;
+	}
+
+	where += (unsigned long)__va(HOST_BIOS_VECTOR_PHYS);
+
+	if (rw) {	// Write
+		if (copy_from_user((void*)where, cc_call->u.rw.data, size)) {
+			return -EFAULT;
+		}
+	} else {	// Read
+		if (copy_to_user(cc_call->u.rw.data, (void*)where, size)) {
+			return -EFAULT;
+		}
+	}
+
+	return 0;
+}
+
+static long cc_rw_dumpvec(xenpf_stratus_call_t *cc_call) {
+	int rw = cc_call->u.rw.rw;
+	int size = cc_call->u.rw.size;
+	void *where = __va(DUMP_VECTOR_PHYS);
+
+	if (size > sizeof(int)) {
+		return -EINVAL;
+	}
+
+	if (rw) {	// Write
+		if (copy_from_user((void*)where, cc_call->u.rw.data, size)) {
+			return -EFAULT;
+		}
+	} else {	// Read
+		if (copy_to_user(cc_call->u.rw.data, (void*)where, size)) {
+			return -EFAULT;
+		}
+	}
+
+	return 0;
+}
+
+static long cc_rw_region(xenpf_stratus_call_t *cc_call) {
+	switch (cc_call->u.rw.region) {
+		case RW_HBV:
+			return cc_rw_hbv(cc_call);
+			break;
+		case RW_DUMPVEC:
+			return cc_rw_dumpvec(cc_call);
+			break;
+		default:
+			return -EINVAL;
+	}
+}
+
+static long cc_smi(xenpf_stratus_call_t *cc_call) {
+	host_request_smi(cc_call->u.smi.dest);
+	return 0;
+}
+
+static long cc_hbv_memset(xenpf_stratus_call_t *cc_call) {
+	int size = cc_call->u.hbv_m.size;
+
+	if (size > HOST_BIOS_VECTOR_SIZE)
+		size = HOST_BIOS_VECTOR_SIZE;
+
+	memset(__va(HOST_BIOS_VECTOR_PHYS), cc_call->u.hbv_m.val, size);
+
+	return 0;
+}
+
+static int locked_out = 1;
+
+static int found_stratus(struct dmi_system_id *d)
+{
+	printk("Stratus platform detected.\n");
+	return 0;
+}
+
+#define NO_MATCH	{ DMI_NONE, NULL}
+#define MATCH		DMI_MATCH
+
+static struct dmi_system_id stratus_platform[] = {
+	{ found_stratus, "Stratus Platform", {
+			MATCH(DMI_BOARD_VENDOR, "Stratus"),
+			NO_MATCH, NO_MATCH, NO_MATCH
+			} },
+	{ NULL, NULL, }
+};
+
+static int check_stratus_dmi(void) {
+	// Run dmi scan looking for Stratus Vendor string.
+	if (dmi_check_system(stratus_platform))
+		return 0;
+
+	return 1;
+}
+
+long do_stratus(xenpf_stratus_call_t *call) {
+	long ret = -EINVAL;
+
+	if (!IS_PRIV(current->domain))
+		return -EPERM;
+
+	if (call->cmd == CC_VALIDATE_PLATFORM)
+		locked_out = check_stratus_dmi();
+
+	if (locked_out)
+		return -EPERM;
+
+	switch (call->cmd) {
+	case CC_TRIGGER_SMI:
+		ret = cc_smi(call);
+		break;
+	case CC_HBV_MEMSET:
+		ret = cc_hbv_memset(call);
+		break;
+	case CC_RW_REGION:
+		ret = cc_rw_region(call);
+		break;
+	case CC_LAPIC_ID:
+		ret = cc_lapic_id(call);
+		break;
+	case CC_CR4:
+		ret = cc_cr4(call);
+		break;
+	case CC_CPUID:
+		ret = cc_cpuid(call);
+		break;
+	case CC_RW_MSR:
+		ret = cc_rw_msr(call);
+		break;
+	case CC_VALIDATE_PLATFORM:
+		ret = 0;	// If we made it here, we are on a Stratus box.
+		break;
+	default:
+		printk("%s:line %d, unknown command %d\n", __func__,
+			__LINE__, call->cmd);
+		break;
+	}
+
+	call->ret = ret;
+
+	return ret;
+}
+
diff -Naurp xen/arch/x86/x86_64/traps.c xen-redhat/arch/x86/x86_64/traps.c
--- xen/arch/x86/x86_64/traps.c
+++ xen-redhat/arch/x86/x86_64/traps.c
@@ -147,15 +147,14 @@ void show_page_walk(unsigned long addr)
 asmlinkage void double_fault(void);
 asmlinkage void do_double_fault(struct cpu_user_regs *regs)
 {
-    unsigned int cpu, tr;
-
-    asm ( "str %0" : "=r" (tr) );
-    cpu = ((tr >> 3) - __FIRST_TSS_ENTRY) >> 2;
+    unsigned int cpu;
 
     watchdog_disable();
 
     console_force_unlock();
 
+    asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
+
     /* Find information saved during fault and dump it to the console. */
     printk("*** DOUBLE FAULT ***\n");
     print_xen_info();
diff -Naurp xen/arch/x86/x86_emulate.c xen-redhat/arch/x86/x86_emulate.c
--- xen/arch/x86/x86_emulate.c
+++ xen-redhat/arch/x86/x86_emulate.c
@@ -103,8 +103,8 @@ static uint8_t opcode_table[256] = {
     ImplicitOps, ImplicitOps, DstReg|SrcMem|ModRM, DstReg|SrcMem16|ModRM|Mov,
     0, 0, 0, 0,
     /* 0x68 - 0x6F */
-    ImplicitOps|Mov, DstMem|SrcImm|ModRM|Mov,
-    ImplicitOps|Mov, DstMem|SrcImmByte|ModRM|Mov,
+    ImplicitOps|Mov, DstReg|SrcImm|ModRM|Mov,
+    ImplicitOps|Mov, DstReg|SrcImmByte|ModRM|Mov,
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     /* 0x70 - 0x77 */
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
@@ -1207,34 +1207,37 @@ x86_emulate(
 
     case 0x69: /* imul imm16/32 */
     case 0x6b: /* imul imm8 */ {
-        unsigned long reg = *(long *)decode_register(modrm_reg, &_regs, 0);
+        unsigned long src1; /* ModR/M source operand */
+        if ( ea.type == OP_REG )
+                src1 = *ea.reg;
+        else if ( (rc = ops->read(ea.mem.seg, ea.mem.off,
+                                &src1, op_bytes, ctxt)) )
+                goto done;
         _regs.eflags &= ~(EFLG_OF|EFLG_CF);
         switch ( dst.bytes )
         {
         case 2:
             dst.val = ((uint32_t)(int16_t)src.val *
-                       (uint32_t)(int16_t)reg);
+                       (uint32_t)(int16_t)src1);
             if ( (int16_t)dst.val != (uint32_t)dst.val )
                 _regs.eflags |= EFLG_OF|EFLG_CF;
             break;
 #ifdef __x86_64__
         case 4:
             dst.val = ((uint64_t)(int32_t)src.val *
-                       (uint64_t)(int32_t)reg);
+                       (uint64_t)(int32_t)src1);
             if ( (int32_t)dst.val != dst.val )
                 _regs.eflags |= EFLG_OF|EFLG_CF;
             break;
 #endif
         default: {
-            unsigned long m[2] = { src.val, reg };
+            unsigned long m[2] = { src.val, src1 };
             if ( imul_dbl(m) )
                 _regs.eflags |= EFLG_OF|EFLG_CF;
             dst.val = m[0];
             break;
         }
         }
-        dst.type = OP_REG;
-        dst.reg  = decode_register(modrm_reg, &_regs, 0);
         break;
     }
 
@@ -1863,7 +1866,7 @@ x86_emulate(
         break;
 
     case 0x9e: /* sahf */
-        *(uint8_t *)_regs.eflags = (((uint8_t *)&_regs.eax)[1] & 0xd7) | 0x02;
+        *(uint8_t *)&_regs.eflags = (((uint8_t *)&_regs.eax)[1] & 0xd7) | 0x02;
         break;
 
     case 0x9f: /* lahf */
diff -Naurp xen/arch/x86/x86_emulate.c.orig xen-redhat/arch/x86/x86_emulate.c.orig
--- xen/arch/x86/x86_emulate.c.orig
+++ xen-redhat/arch/x86/x86_emulate.c.orig
@@ -0,0 +1,2428 @@
+/******************************************************************************
+ * x86_emulate.c
+ * 
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ * 
+ * Copyright (c) 2005-2007 Keir Fraser
+ * Copyright (c) 2005-2007 XenSource Inc.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __XEN__
+#include <stddef.h>
+#include <stdint.h>
+#include <public/xen.h>
+#else
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/lib.h>
+#include <asm/regs.h>
+#undef cmpxchg
+#endif
+#include <asm-x86/x86_emulate.h>
+
+/* Operand sizes: 8-bit operands or specified/overridden size. */
+#define ByteOp      (1<<0) /* 8-bit operands. */
+/* Destination operand type. */
+#define DstBitBase  (0<<1) /* Memory operand, bit string. */
+#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
+#define DstReg      (2<<1) /* Register operand. */
+#define DstMem      (3<<1) /* Memory operand. */
+#define DstMask     (3<<1)
+/* Source operand type. */
+#define SrcNone     (0<<3) /* No source operand. */
+#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
+#define SrcReg      (1<<3) /* Register operand. */
+#define SrcMem      (2<<3) /* Memory operand. */
+#define SrcMem16    (3<<3) /* Memory operand (16-bit). */
+#define SrcImm      (4<<3) /* Immediate operand. */
+#define SrcImmByte  (5<<3) /* 8-bit sign-extended immediate operand. */
+#define SrcMask     (7<<3)
+/* Generic ModRM decode. */
+#define ModRM       (1<<6)
+/* Destination is only written; never read. */
+#define Mov         (1<<7)
+
+static uint8_t opcode_table[256] = {
+    /* 0x00 - 0x07 */
+    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+    ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
+    ByteOp|DstReg|SrcImm, DstReg|SrcImm, 0, 0,
+    /* 0x08 - 0x0F */
+    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+    ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
+    ByteOp|DstReg|SrcImm, DstReg|SrcImm, 0, 0,
+    /* 0x10 - 0x17 */
+    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+    ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
+    ByteOp|DstReg|SrcImm, DstReg|SrcImm, 0, 0,
+    /* 0x18 - 0x1F */
+    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+    ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
+    ByteOp|DstReg|SrcImm, DstReg|SrcImm, 0, 0,
+    /* 0x20 - 0x27 */
+    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+    ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
+    ByteOp|DstReg|SrcImm, DstReg|SrcImm, 0, ImplicitOps,
+    /* 0x28 - 0x2F */
+    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+    ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
+    ByteOp|DstReg|SrcImm, DstReg|SrcImm, 0, ImplicitOps,
+    /* 0x30 - 0x37 */
+    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+    ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
+    ByteOp|DstReg|SrcImm, DstReg|SrcImm, 0, ImplicitOps,
+    /* 0x38 - 0x3F */
+    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+    ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
+    ByteOp|DstReg|SrcImm, DstReg|SrcImm, 0, ImplicitOps,
+    /* 0x40 - 0x4F */
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    /* 0x50 - 0x5F */
+    ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov,
+    ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov,
+    ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov,
+    ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov,
+    /* 0x60 - 0x67 */
+    ImplicitOps, ImplicitOps, DstReg|SrcMem|ModRM, DstReg|SrcMem16|ModRM|Mov,
+    0, 0, 0, 0,
+    /* 0x68 - 0x6F */
+    ImplicitOps|Mov, DstMem|SrcImm|ModRM|Mov,
+    ImplicitOps|Mov, DstMem|SrcImmByte|ModRM|Mov,
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    /* 0x70 - 0x77 */
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    /* 0x78 - 0x7F */
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    /* 0x80 - 0x87 */
+    ByteOp|DstMem|SrcImm|ModRM, DstMem|SrcImm|ModRM,
+    ByteOp|DstMem|SrcImm|ModRM, DstMem|SrcImmByte|ModRM,
+    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+    /* 0x88 - 0x8F */
+    ByteOp|DstMem|SrcReg|ModRM|Mov, DstMem|SrcReg|ModRM|Mov,
+    ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
+    0, DstReg|SrcNone|ModRM, 0, DstMem|SrcNone|ModRM|Mov,
+    /* 0x90 - 0x97 */
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    /* 0x98 - 0x9F */
+    ImplicitOps, ImplicitOps, 0, 0, 0, 0, ImplicitOps, ImplicitOps,
+    /* 0xA0 - 0xA7 */
+    ByteOp|ImplicitOps|Mov, ImplicitOps|Mov,
+    ByteOp|ImplicitOps|Mov, ImplicitOps|Mov,
+    ByteOp|ImplicitOps|Mov, ImplicitOps|Mov, 0, 0,
+    /* 0xA8 - 0xAF */
+    ByteOp|DstReg|SrcImm, DstReg|SrcImm,
+    ByteOp|ImplicitOps|Mov, ImplicitOps|Mov,
+    ByteOp|ImplicitOps|Mov, ImplicitOps|Mov, 0, 0,
+    /* 0xB0 - 0xB7 */
+    ByteOp|DstReg|SrcImm|Mov, ByteOp|DstReg|SrcImm|Mov,
+    ByteOp|DstReg|SrcImm|Mov, ByteOp|DstReg|SrcImm|Mov,
+    ByteOp|DstReg|SrcImm|Mov, ByteOp|DstReg|SrcImm|Mov,
+    ByteOp|DstReg|SrcImm|Mov, ByteOp|DstReg|SrcImm|Mov,
+    /* 0xB8 - 0xBF */
+    DstReg|SrcImm|Mov, DstReg|SrcImm|Mov, DstReg|SrcImm|Mov, DstReg|SrcImm|Mov,
+    DstReg|SrcImm|Mov, DstReg|SrcImm|Mov, DstReg|SrcImm|Mov, DstReg|SrcImm|Mov,
+    /* 0xC0 - 0xC7 */
+    ByteOp|DstMem|SrcImm|ModRM, DstMem|SrcImmByte|ModRM,
+    ImplicitOps, ImplicitOps,
+    0, 0, ByteOp|DstMem|SrcImm|ModRM|Mov, DstMem|SrcImm|ModRM|Mov,
+    /* 0xC8 - 0xCF */
+    0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0xD0 - 0xD7 */
+    ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM, 
+    ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM, 
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    /* 0xD8 - 0xDF */
+    0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0xE0 - 0xE7 */
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    /* 0xE8 - 0xEF */
+    ImplicitOps, ImplicitOps, 0, ImplicitOps,
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    /* 0xF0 - 0xF7 */
+    0, 0, 0, 0,
+    0, ImplicitOps, ByteOp|DstMem|SrcNone|ModRM, DstMem|SrcNone|ModRM,
+    /* 0xF8 - 0xFF */
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    ImplicitOps, ImplicitOps, ByteOp|DstMem|SrcNone|ModRM, DstMem|SrcNone|ModRM
+};
+
+static uint8_t twobyte_table[256] = {
+    /* 0x00 - 0x07 */
+    0, 0, 0, 0, 0, ImplicitOps, 0, 0,
+    /* 0x08 - 0x0F */
+    ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps|ModRM, 0, 0,
+    /* 0x10 - 0x17 */
+    0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0x18 - 0x1F */
+    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
+    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
+    /* 0x20 - 0x27 */
+    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
+    0, 0, 0, 0,
+    /* 0x28 - 0x2F */
+    0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0x30 - 0x37 */
+    ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0,
+    /* 0x38 - 0x3F */
+    0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0x40 - 0x47 */
+    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
+    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
+    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
+    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
+    /* 0x48 - 0x4F */
+    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
+    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
+    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
+    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
+    /* 0x50 - 0x5F */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0x60 - 0x6F */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0x70 - 0x7F */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0x80 - 0x87 */
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    /* 0x88 - 0x8F */
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    /* 0x90 - 0x97 */
+    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
+    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
+    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
+    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
+    /* 0x98 - 0x9F */
+    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
+    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
+    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
+    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
+    /* 0xA0 - 0xA7 */
+    0, 0, 0, DstBitBase|SrcReg|ModRM, 0, 0, 0, 0, 
+    /* 0xA8 - 0xAF */
+    0, 0, 0, DstBitBase|SrcReg|ModRM, 0, 0, 0, DstReg|SrcMem|ModRM,
+    /* 0xB0 - 0xB7 */
+    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+    0, DstBitBase|SrcReg|ModRM,
+    0, 0, ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov,
+    /* 0xB8 - 0xBF */
+    0, 0, DstBitBase|SrcImmByte|ModRM, DstBitBase|SrcReg|ModRM,
+    DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
+    ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov,
+    /* 0xC0 - 0xC7 */
+    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, 0,
+    0, 0, 0, ImplicitOps|ModRM,
+    /* 0xC8 - 0xCF */
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    /* 0xD0 - 0xDF */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0xE0 - 0xEF */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0xF0 - 0xFF */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* Type, address-of, and value of an instruction's operand. */
+struct operand {
+    enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
+    unsigned int  bytes;
+    unsigned long val, orig_val;
+    union {
+        /* OP_REG: Pointer to register field. */
+        unsigned long *reg;
+        /* OP_MEM: Segment and offset. */
+        struct {
+            enum x86_segment seg;
+            unsigned long    off;
+        } mem;
+    };
+};
+
+/* EFLAGS bit definitions. */
+#define EFLG_OF (1<<11)
+#define EFLG_DF (1<<10)
+#define EFLG_IF (1<<9)
+#define EFLG_SF (1<<7)
+#define EFLG_ZF (1<<6)
+#define EFLG_AF (1<<4)
+#define EFLG_PF (1<<2)
+#define EFLG_CF (1<<0)
+
+/* Exception definitions. */
+#define EXC_DE  0
+#define EXC_BR  5
+#define EXC_UD  6
+#define EXC_GP 13
+
+/*
+ * Instruction emulation:
+ * Most instructions are emulated directly via a fragment of inline assembly
+ * code. This allows us to save/restore EFLAGS and thus very easily pick up
+ * any modified flags.
+ */
+
+#if defined(__x86_64__)
+#define _LO32 "k"          /* force 32-bit operand */
+#define _STK  "%%rsp"      /* stack pointer */
+#elif defined(__i386__)
+#define _LO32 ""           /* force 32-bit operand */
+#define _STK  "%%esp"      /* stack pointer */
+#endif
+
+/*
+ * These EFLAGS bits are restored from saved value during emulation, and
+ * any changes are written back to the saved value after emulation.
+ */
+#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
+
+/* Before executing instruction: restore necessary bits in EFLAGS. */
+#define _PRE_EFLAGS(_sav, _msk, _tmp)                           \
+/* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
+"movl %"_sav",%"_LO32 _tmp"; "                                  \
+"push %"_tmp"; "                                                \
+"push %"_tmp"; "                                                \
+"movl %"_msk",%"_LO32 _tmp"; "                                  \
+"andl %"_LO32 _tmp",("_STK"); "                                 \
+"pushf; "                                                       \
+"notl %"_LO32 _tmp"; "                                          \
+"andl %"_LO32 _tmp",("_STK"); "                                 \
+"andl %"_LO32 _tmp","STR(BITS_PER_LONG/4)"("_STK"); "           \
+"pop  %"_tmp"; "                                                \
+"orl  %"_LO32 _tmp",("_STK"); "                                 \
+"popf; "                                                        \
+"pop  %"_sav"; "
+
+/* After executing instruction: write-back necessary bits in EFLAGS. */
+#define _POST_EFLAGS(_sav, _msk, _tmp)          \
+/* _sav |= EFLAGS & _msk; */                    \
+"pushf; "                                       \
+"pop  %"_tmp"; "                                \
+"andl %"_msk",%"_LO32 _tmp"; "                  \
+"orl  %"_LO32 _tmp",%"_sav"; "
+
+/* Raw emulation: instruction has two explicit operands. */
+#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy)\
+do{ unsigned long _tmp;                                                    \
+    switch ( (_dst).bytes )                                                \
+    {                                                                      \
+    case 2:                                                                \
+        __asm__ __volatile__ (                                             \
+            _PRE_EFLAGS("0","4","2")                                       \
+            _op"w %"_wx"3,%1; "                                            \
+            _POST_EFLAGS("0","4","2")                                      \
+            : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp)              \
+            : _wy ((_src).val), "i" (EFLAGS_MASK),                         \
+              "m" (_eflags), "m" ((_dst).val) );                           \
+        break;                                                             \
+    case 4:                                                                \
+        __asm__ __volatile__ (                                             \
+            _PRE_EFLAGS("0","4","2")                                       \
+            _op"l %"_lx"3,%1; "                                            \
+            _POST_EFLAGS("0","4","2")                                      \
+            : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp)              \
+            : _ly ((_src).val), "i" (EFLAGS_MASK),                         \
+              "m" (_eflags), "m" ((_dst).val) );                           \
+        break;                                                             \
+    case 8:                                                                \
+        __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy);           \
+        break;                                                             \
+    }                                                                      \
+} while (0)
+#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy)\
+do{ unsigned long _tmp;                                                    \
+    switch ( (_dst).bytes )                                                \
+    {                                                                      \
+    case 1:                                                                \
+        __asm__ __volatile__ (                                             \
+            _PRE_EFLAGS("0","4","2")                                       \
+            _op"b %"_bx"3,%1; "                                            \
+            _POST_EFLAGS("0","4","2")                                      \
+            : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp)              \
+            : _by ((_src).val), "i" (EFLAGS_MASK),                         \
+              "m" (_eflags), "m" ((_dst).val) );                           \
+        break;                                                             \
+    default:                                                               \
+        __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy);\
+        break;                                                             \
+    }                                                                      \
+} while (0)
+/* Source operand is byte-sized and may be restricted to just %cl. */
+#define emulate_2op_SrcB(_op, _src, _dst, _eflags)                         \
+    __emulate_2op(_op, _src, _dst, _eflags,                                \
+                  "b", "c", "b", "c", "b", "c", "b", "c")
+/* Source operand is byte, word, long or quad sized. */
+#define emulate_2op_SrcV(_op, _src, _dst, _eflags)                         \
+    __emulate_2op(_op, _src, _dst, _eflags,                                \
+                  "b", "q", "w", "r", _LO32, "r", "", "r")
+/* Source operand is word, long or quad sized. */
+#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags)                  \
+    __emulate_2op_nobyte(_op, _src, _dst, _eflags,                         \
+                  "w", "r", _LO32, "r", "", "r")
+
+/* Instruction has only one explicit operand (no source operand). */
+#define emulate_1op(_op,_dst,_eflags)                                      \
+do{ unsigned long _tmp;                                                    \
+    switch ( (_dst).bytes )                                                \
+    {                                                                      \
+    case 1:                                                                \
+        __asm__ __volatile__ (                                             \
+            _PRE_EFLAGS("0","3","2")                                       \
+            _op"b %1; "                                                    \
+            _POST_EFLAGS("0","3","2")                                      \
+            : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp)              \
+            : "i" (EFLAGS_MASK), "m" (_eflags), "m" ((_dst).val) );        \
+        break;                                                             \
+    case 2:                                                                \
+        __asm__ __volatile__ (                                             \
+            _PRE_EFLAGS("0","3","2")                                       \
+            _op"w %1; "                                                    \
+            _POST_EFLAGS("0","3","2")                                      \
+            : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp)              \
+            : "i" (EFLAGS_MASK), "m" (_eflags), "m" ((_dst).val) );        \
+        break;                                                             \
+    case 4:                                                                \
+        __asm__ __volatile__ (                                             \
+            _PRE_EFLAGS("0","3","2")                                       \
+            _op"l %1; "                                                    \
+            _POST_EFLAGS("0","3","2")                                      \
+            : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp)              \
+            : "i" (EFLAGS_MASK), "m" (_eflags), "m" ((_dst).val) );        \
+        break;                                                             \
+    case 8:                                                                \
+        __emulate_1op_8byte(_op, _dst, _eflags);                           \
+        break;                                                             \
+    }                                                                      \
+} while (0)
+
+/* Emulate an instruction with quadword operands (x86/64 only). */
+#if defined(__x86_64__)
+#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)         \
+do{ __asm__ __volatile__ (                                              \
+        _PRE_EFLAGS("0","4","2")                                        \
+        _op"q %"_qx"3,%1; "                                             \
+        _POST_EFLAGS("0","4","2")                                       \
+        : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp)               \
+        : _qy ((_src).val), "i" (EFLAGS_MASK),                          \
+          "m" (_eflags), "m" ((_dst).val) );                            \
+} while (0)
+#define __emulate_1op_8byte(_op, _dst, _eflags)                         \
+do{ __asm__ __volatile__ (                                              \
+        _PRE_EFLAGS("0","3","2")                                        \
+        _op"q %1; "                                                     \
+        _POST_EFLAGS("0","3","2")                                       \
+        : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp)               \
+        : "i" (EFLAGS_MASK), "m" (_eflags), "m" ((_dst).val) );         \
+} while (0)
+#elif defined(__i386__)
+#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
+#define __emulate_1op_8byte(_op, _dst, _eflags)
+#endif /* __i386__ */
+
+/* Fetch next part of the instruction being emulated. */
+#define insn_fetch_bytes(_size)                                         \
+({ unsigned long _x, _eip = _regs.eip;                                  \
+   if ( !mode_64bit() ) _eip = (uint32_t)_eip; /* ignore upper dword */ \
+   _regs.eip += (_size); /* real hardware doesn't truncate */           \
+   generate_exception_if((uint8_t)(_regs.eip - ctxt->regs->eip) > 15,   \
+                         EXC_GP);                                       \
+   rc = ops->insn_fetch(x86_seg_cs, _eip, &_x, (_size), ctxt);          \
+   if ( rc ) goto done;                                                 \
+   _x;                                                                  \
+})
+#define insn_fetch_type(_type) ((_type)insn_fetch_bytes(sizeof(_type)))
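+
+/*
+ * insn_fetch_type() casts the fetched bytes through the requested type,
+ * so intN_t immediates are sign-extended and uintN_t ones zero-extended
+ * when stored into an unsigned long.  The check above rejects any decode
+ * that runs past the architectural 15-byte instruction-length limit.
+ */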
+
+#define _truncate_ea(ea, byte_width)            \
+({  unsigned long __ea = (ea);                  \
+    unsigned int _width = (byte_width);         \
+    ((_width == sizeof(unsigned long)) ? __ea : \
+     (__ea & ((1UL << (_width << 3)) - 1)));    \
+})
+#define truncate_ea(ea) _truncate_ea((ea), ad_bytes)
+
+#define mode_64bit() (def_ad_bytes == 8)
+
+#define fail_if(p)                                      \
+do {                                                    \
+    rc = (p) ? X86EMUL_UNHANDLEABLE : X86EMUL_OKAY;     \
+    if ( rc ) goto done;                                \
+} while (0)
+
+/* In future we will be able to generate arbitrary exceptions. */
+#define generate_exception_if(p, e) fail_if(p)
+
+/* To be done... */
+#define mode_ring0() (0)
+#define mode_iopl()  (0)
+
+/* Does the given byte have even parity (an even number of 1 bits)? */
+static int even_parity(uint8_t v)
+{
+    __asm__ ( "test %%al,%%al; setp %%al"
+              : "=a" (v) : "0" (v) );
+    return v;
+}
+
+/* Update address held in a register, based on addressing mode. */
+#define _register_address_increment(reg, inc, byte_width)               \
+do {                                                                    \
+    int _inc = (inc); /* signed type ensures sign extension to long */  \
+    unsigned int _width = (byte_width);                                 \
+    if ( _width == sizeof(unsigned long) )                              \
+        (reg) += _inc;                                                  \
+    else if ( mode_64bit() )                                            \
+        (reg) = ((reg) + _inc) & ((1UL << (_width << 3)) - 1);          \
+    else                                                                \
+        (reg) = ((reg) & ~((1UL << (_width << 3)) - 1)) |               \
+                (((reg) + _inc) & ((1UL << (_width << 3)) - 1));        \
+} while (0)
+#define register_address_increment(reg, inc) \
+    _register_address_increment((reg), (inc), ad_bytes)
+
+#define sp_pre_dec(dec) ({                                              \
+    _register_address_increment(_regs.esp, -(dec), ctxt->sp_size/8);    \
+    _truncate_ea(_regs.esp, ctxt->sp_size/8);                           \
+})
+#define sp_post_inc(inc) ({                                             \
+    unsigned long __esp = _truncate_ea(_regs.esp, ctxt->sp_size/8);     \
+    _register_address_increment(_regs.esp, (inc), ctxt->sp_size/8);     \
+    __esp;                                                              \
+})
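+
+/*
+ * sp_pre_dec()/sp_post_inc() adjust the shadow stack pointer at the
+ * width given by ctxt->sp_size and return the effective address to be
+ * written (push) or read (pop), truncated to that width.
+ */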
+
+#define jmp_rel(rel)                                                    \
+do {                                                                    \
+    _regs.eip += (int)(rel);                                            \
+    if ( !mode_64bit() )                                                \
+        _regs.eip = ((op_bytes == 2)                                    \
+                     ? (uint16_t)_regs.eip : (uint32_t)_regs.eip);      \
+} while (0)
+
+static int __handle_rep_prefix(
+    struct cpu_user_regs *int_regs,
+    struct cpu_user_regs *ext_regs,
+    int ad_bytes)
+{
+    unsigned long ecx = ((ad_bytes == 2) ? (uint16_t)int_regs->ecx :
+                         (ad_bytes == 4) ? (uint32_t)int_regs->ecx :
+                         int_regs->ecx);
+
+    if ( ecx-- == 0 )
+    {
+        ext_regs->eip = int_regs->eip;
+        return 1;
+    }
+
+    if ( ad_bytes == 2 )
+        *(uint16_t *)&int_regs->ecx = ecx;
+    else if ( ad_bytes == 4 )
+        int_regs->ecx = (uint32_t)ecx;
+    else
+        int_regs->ecx = ecx;
+    int_regs->eip = ext_regs->eip;
+    return 0;
+}
+
+#define handle_rep_prefix()                                                \
+do {                                                                       \
+    if ( rep_prefix && __handle_rep_prefix(&_regs, ctxt->regs, ad_bytes) ) \
+        goto done;                                                         \
+} while (0)
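+
+/*
+ * REP-prefixed string instructions are emulated one iteration at a time:
+ * if eCX (at the current address size) is already zero the instruction
+ * is skipped by committing the post-decode EIP directly; otherwise eCX
+ * is decremented and the shadow EIP is rewound to the start of the
+ * instruction so the next iteration re-enters the emulator.
+ */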
+
+/*
+ * Unsigned multiplication with double-word result.
+ * IN:  Multiplicand=m[0], Multiplier=m[1]
+ * OUT: Return CF/OF (overflow status); Result=m[1]:m[0]
+ */
+static int mul_dbl(unsigned long m[2])
+{
+    int rc;
+    asm ( "mul %4; seto %b2"
+          : "=a" (m[0]), "=d" (m[1]), "=q" (rc)
+          : "0" (m[0]), "1" (m[1]), "2" (0) );
+    return rc;
+}
+
+/*
+ * Signed multiplication with double-word result.
+ * IN:  Multiplicand=m[0], Multiplier=m[1]
+ * OUT: Return CF/OF (overflow status); Result=m[1]:m[0]
+ */
+static int imul_dbl(unsigned long m[2])
+{
+    int rc;
+    asm ( "imul %4; seto %b2"
+          : "=a" (m[0]), "=d" (m[1]), "=q" (rc)
+          : "0" (m[0]), "1" (m[1]), "2" (0) );
+    return rc;
+}
+
+/*
+ * Unsigned division of double-word dividend.
+ * IN:  Dividend=u[1]:u[0], Divisor=v
+ * OUT: Return 1: #DE
+ *      Return 0: Quotient=u[0], Remainder=u[1]
+ */
+static int div_dbl(unsigned long u[2], unsigned long v)
+{
+    if ( (v == 0) || (u[1] >= v) )
+        return 1;
+    asm ( "div %4"
+          : "=a" (u[0]), "=d" (u[1])
+          : "0" (u[0]), "1" (u[1]), "r" (v) );
+    return 0;
+}
+
+/*
+ * Signed division of double-word dividend.
+ * IN:  Dividend=u[1]:u[0], Divisor=v
+ * OUT: Return 1: #DE
+ *      Return 0: Quotient=u[0], Remainder=u[1]
+ * NB. We don't use idiv directly as it's moderately hard to work out
+ *     ahead of time whether it will #DE, which we cannot allow to happen.
+ */
+static int idiv_dbl(unsigned long u[2], unsigned long v)
+{
+    int negu = (long)u[1] < 0, negv = (long)v < 0;
+
+    /* u = abs(u) */
+    if ( negu )
+    {
+        u[1] = ~u[1];
+        if ( (u[0] = -u[0]) == 0 )
+            u[1]++;
+    }
+
+    /* abs(u) / abs(v) */
+    if ( div_dbl(u, negv ? -v : v) )
+        return 1;
+
+    /* Remainder has same sign as dividend. It cannot overflow. */
+    if ( negu )
+        u[1] = -u[1];
+
+    /* Quotient is overflowed if sign bit is set. */
+    if ( negu ^ negv )
+    {
+        if ( (long)u[0] >= 0 )
+            u[0] = -u[0];
+        else if ( (u[0] << 1) != 0 ) /* == 0x80...0 is okay */
+            return 1;
+    }
+    else if ( (long)u[0] < 0 )
+        return 1;
+
+    return 0;
+}
+
+static int
+test_cc(
+    unsigned int condition, unsigned int flags)
+{
+    int rc = 0;
+
+    switch ( (condition & 15) >> 1 )
+    {
+    case 0: /* o */
+        rc |= (flags & EFLG_OF);
+        break;
+    case 1: /* b/c/nae */
+        rc |= (flags & EFLG_CF);
+        break;
+    case 2: /* z/e */
+        rc |= (flags & EFLG_ZF);
+        break;
+    case 3: /* be/na */
+        rc |= (flags & (EFLG_CF|EFLG_ZF));
+        break;
+    case 4: /* s */
+        rc |= (flags & EFLG_SF);
+        break;
+    case 5: /* p/pe */
+        rc |= (flags & EFLG_PF);
+        break;
+    case 7: /* le/ng */
+        rc |= (flags & EFLG_ZF);
+        /* fall through */
+    case 6: /* l/nge */
+        rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
+        break;
+    }
+
+    /* Odd condition identifiers (lsb == 1) have inverted sense. */
+    return (!!rc ^ (condition & 1));
+}
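+
+/*
+ * test_cc() maps the low nibble of a Jcc/SETcc/CMOVcc opcode onto the
+ * flag predicate it encodes: bits 3:1 select the base condition and
+ * bit 0 inverts it (e.g. 0x4 tests ZF for 'e', 0x5 gives 'ne').
+ */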
+
+void *
+decode_register(
+    uint8_t modrm_reg, struct cpu_user_regs *regs, int highbyte_regs)
+{
+    void *p;
+
+    switch ( modrm_reg )
+    {
+    case  0: p = &regs->eax; break;
+    case  1: p = &regs->ecx; break;
+    case  2: p = &regs->edx; break;
+    case  3: p = &regs->ebx; break;
+    case  4: p = (highbyte_regs ?
+                  ((unsigned char *)&regs->eax + 1) : 
+                  (unsigned char *)&regs->esp); break;
+    case  5: p = (highbyte_regs ?
+                  ((unsigned char *)&regs->ecx + 1) : 
+                  (unsigned char *)&regs->ebp); break;
+    case  6: p = (highbyte_regs ?
+                  ((unsigned char *)&regs->edx + 1) : 
+                  (unsigned char *)&regs->esi); break;
+    case  7: p = (highbyte_regs ?
+                  ((unsigned char *)&regs->ebx + 1) : 
+                  (unsigned char *)&regs->edi); break;
+#if defined(__x86_64__)
+    case  8: p = &regs->r8;  break;
+    case  9: p = &regs->r9;  break;
+    case 10: p = &regs->r10; break;
+    case 11: p = &regs->r11; break;
+    case 12: p = &regs->r12; break;
+    case 13: p = &regs->r13; break;
+    case 14: p = &regs->r14; break;
+    case 15: p = &regs->r15; break;
+#endif
+    default: p = NULL; break;
+    }
+
+    return p;
+}
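+
+/*
+ * decode_register() returns a pointer into cpu_user_regs for a 3/4-bit
+ * register encoding.  When highbyte_regs is set (byte operand with no
+ * REX prefix), encodings 4-7 select AH/CH/DH/BH rather than SP/BP/SI/DI.
+ */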
+
+int
+x86_emulate(
+    struct x86_emulate_ctxt *ctxt,
+    struct x86_emulate_ops  *ops)
+{
+    /* Shadow copy of register state. Committed on successful emulation. */
+    struct cpu_user_regs _regs = *ctxt->regs;
+
+    uint8_t b, d, sib, sib_index, sib_base, twobyte = 0, rex_prefix = 0;
+    uint8_t modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
+    unsigned int op_bytes, def_op_bytes, ad_bytes, def_ad_bytes;
+    unsigned int lock_prefix = 0, rep_prefix = 0;
+    int override_seg = -1, rc = X86EMUL_OKAY;
+    struct operand src, dst;
+
+    /* Data operand effective address (usually computed from ModRM). */
+    struct operand ea;
+
+    /* Default is a memory operand relative to segment DS. */
+    ea.type    = OP_MEM;
+    ea.mem.seg = x86_seg_ds;
+    ea.mem.off = 0;
+
+    op_bytes = def_op_bytes = ad_bytes = def_ad_bytes = ctxt->addr_size/8;
+    if ( op_bytes == 8 )
+    {
+        op_bytes = def_op_bytes = 4;
+#ifndef __x86_64__
+        return X86EMUL_UNHANDLEABLE;
+#endif
+    }
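+
+    /*
+     * ctxt->addr_size gives the default address size for the current
+     * execution mode.  In 64-bit mode that is 8 bytes while the default
+     * operand size remains 4 bytes (widened to 8 only by REX.W below);
+     * such guests cannot be handled by a non-x86_64 build at all.
+     */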
+
+    /* Prefix bytes. */
+    for ( ; ; )
+    {
+        switch ( b = insn_fetch_type(uint8_t) )
+        {
+        case 0x66: /* operand-size override */
+            op_bytes = def_op_bytes ^ 6;
+            break;
+        case 0x67: /* address-size override */
+            ad_bytes = def_ad_bytes ^ (mode_64bit() ? 12 : 6);
+            break;
+        case 0x2e: /* CS override */
+            override_seg = x86_seg_cs;
+            break;
+        case 0x3e: /* DS override */
+            override_seg = x86_seg_ds;
+            break;
+        case 0x26: /* ES override */
+            override_seg = x86_seg_es;
+            break;
+        case 0x64: /* FS override */
+            override_seg = x86_seg_fs;
+            break;
+        case 0x65: /* GS override */
+            override_seg = x86_seg_gs;
+            break;
+        case 0x36: /* SS override */
+            override_seg = x86_seg_ss;
+            break;
+        case 0xf0: /* LOCK */
+            lock_prefix = 1;
+            break;
+        case 0xf2: /* REPNE/REPNZ */
+        case 0xf3: /* REP/REPE/REPZ */
+            rep_prefix = 1;
+            break;
+        case 0x40 ... 0x4f: /* REX */
+            if ( !mode_64bit() )
+                goto done_prefixes;
+            rex_prefix = b;
+            continue;
+        default:
+            goto done_prefixes;
+        }
+
+        /* Any legacy prefix after a REX prefix nullifies its effect. */
+        rex_prefix = 0;
+    }
+ done_prefixes:
+
+    if ( rex_prefix & 8 ) /* REX.W */
+        op_bytes = 8;
+
+    /* Opcode byte(s). */
+    d = opcode_table[b];
+    if ( d == 0 )
+    {
+        /* Two-byte opcode? */
+        if ( b == 0x0f )
+        {
+            twobyte = 1;
+            b = insn_fetch_type(uint8_t);
+            d = twobyte_table[b];
+        }
+
+        /* Unrecognised? */
+        if ( d == 0 )
+            goto cannot_emulate;
+    }
+
+    /* Lock prefix is allowed only on RMW instructions. */
+    generate_exception_if((d & Mov) && lock_prefix, EXC_GP);
+
+    /* ModRM and SIB bytes. */
+    if ( d & ModRM )
+    {
+        modrm = insn_fetch_type(uint8_t);
+        modrm_mod = (modrm & 0xc0) >> 6;
+        modrm_reg = ((rex_prefix & 4) << 1) | ((modrm & 0x38) >> 3);
+        modrm_rm  = modrm & 0x07;
+
+        if ( modrm_mod == 3 )
+        {
+            modrm_rm |= (rex_prefix & 1) << 3;
+            ea.type = OP_REG;
+            ea.reg  = decode_register(
+                modrm_rm, &_regs, (d & ByteOp) && (rex_prefix == 0));
+        }
+        else if ( ad_bytes == 2 )
+        {
+            /* 16-bit ModR/M decode. */
+            switch ( modrm_rm )
+            {
+            case 0:
+                ea.mem.off = _regs.ebx + _regs.esi;
+                break;
+            case 1:
+                ea.mem.off = _regs.ebx + _regs.edi;
+                break;
+            case 2:
+                ea.mem.seg = x86_seg_ss;
+                ea.mem.off = _regs.ebp + _regs.esi;
+                break;
+            case 3:
+                ea.mem.seg = x86_seg_ss;
+                ea.mem.off = _regs.ebp + _regs.edi;
+                break;
+            case 4:
+                ea.mem.off = _regs.esi;
+                break;
+            case 5:
+                ea.mem.off = _regs.edi;
+                break;
+            case 6:
+                if ( modrm_mod == 0 )
+                    break;
+                ea.mem.seg = x86_seg_ss;
+                ea.mem.off = _regs.ebp;
+                break;
+            case 7:
+                ea.mem.off = _regs.ebx;
+                break;
+            }
+            switch ( modrm_mod )
+            {
+            case 0:
+                if ( modrm_rm == 6 )
+                    ea.mem.off = insn_fetch_type(int16_t);
+                break;
+            case 1:
+                ea.mem.off += insn_fetch_type(int8_t);
+                break;
+            case 2:
+                ea.mem.off += insn_fetch_type(int16_t);
+                break;
+            }
+            ea.mem.off = truncate_ea(ea.mem.off);
+        }
+        else
+        {
+            /* 32/64-bit ModR/M decode. */
+            if ( modrm_rm == 4 )
+            {
+                sib = insn_fetch_type(uint8_t);
+                sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8);
+                sib_base  = (sib & 7) | ((rex_prefix << 3) & 8);
+                if ( sib_index != 4 )
+                    ea.mem.off = *(long*)decode_register(sib_index, &_regs, 0);
+                ea.mem.off <<= (sib >> 6) & 3;
+                if ( (modrm_mod == 0) && ((sib_base & 7) == 5) )
+                    ea.mem.off += insn_fetch_type(int32_t);
+                else if ( sib_base == 4 )
+                {
+                    ea.mem.seg  = x86_seg_ss;
+                    ea.mem.off += _regs.esp;
+                    if ( !twobyte && (b == 0x8f) )
+                        /* POP <rm> computes its EA post increment. */
+                        ea.mem.off += ((mode_64bit() && (op_bytes == 4))
+                                       ? 8 : op_bytes);
+                }
+                else if ( sib_base == 5 )
+                {
+                    ea.mem.seg  = x86_seg_ss;
+                    ea.mem.off += _regs.ebp;
+                }
+                else
+                    ea.mem.off += *(long*)decode_register(sib_base, &_regs, 0);
+            }
+            else
+            {
+                modrm_rm |= (rex_prefix & 1) << 3;
+                ea.mem.off = *(long *)decode_register(modrm_rm, &_regs, 0);
+                if ( (modrm_rm == 5) && (modrm_mod != 0) )
+                    ea.mem.seg = x86_seg_ss;
+            }
+            switch ( modrm_mod )
+            {
+            case 0:
+                if ( (modrm_rm & 7) != 5 )
+                    break;
+                ea.mem.off = insn_fetch_type(int32_t);
+                if ( !mode_64bit() )
+                    break;
+                /* Relative to RIP of next instruction. Argh! */
+                ea.mem.off += _regs.eip;
+                if ( (d & SrcMask) == SrcImm )
+                    ea.mem.off += (d & ByteOp) ? 1 :
+                        ((op_bytes == 8) ? 4 : op_bytes);
+                else if ( (d & SrcMask) == SrcImmByte )
+                    ea.mem.off += 1;
+                else if ( ((b == 0xf6) || (b == 0xf7)) &&
+                          ((modrm_reg & 7) <= 1) )
+                    /* Special case in Grp3: test has immediate operand. */
+                    ea.mem.off += (d & ByteOp) ? 1
+                        : ((op_bytes == 8) ? 4 : op_bytes);
+                break;
+            case 1:
+                ea.mem.off += insn_fetch_type(int8_t);
+                break;
+            case 2:
+                ea.mem.off += insn_fetch_type(int32_t);
+                break;
+            }
+            ea.mem.off = truncate_ea(ea.mem.off);
+        }
+    }
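+
+    /*
+     * For instructions with a ModRM byte, ea now describes that operand:
+     * either a pointer to a register (mod == 3) or a seg:offset pair
+     * already truncated to the effective address size.  Any segment
+     * override prefix is applied just below.
+     */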
+
+    if ( override_seg != -1 )
+        ea.mem.seg = override_seg;
+
+    /* Special instructions do their own operand decoding. */
+    if ( (d & DstMask) == ImplicitOps )
+        goto special_insn;
+
+    /* Decode and fetch the source operand: register, memory or immediate. */
+    switch ( d & SrcMask )
+    {
+    case SrcNone:
+        break;
+    case SrcReg:
+        src.type = OP_REG;
+        if ( d & ByteOp )
+        {
+            src.reg = decode_register(modrm_reg, &_regs, (rex_prefix == 0));
+            src.val = *(uint8_t *)src.reg;
+            src.bytes = 1;
+        }
+        else
+        {
+            src.reg = decode_register(modrm_reg, &_regs, 0);
+            switch ( (src.bytes = op_bytes) )
+            {
+            case 2: src.val = *(uint16_t *)src.reg; break;
+            case 4: src.val = *(uint32_t *)src.reg; break;
+            case 8: src.val = *(uint64_t *)src.reg; break;
+            }
+        }
+        break;
+    case SrcMem16:
+        ea.bytes = 2;
+        goto srcmem_common;
+    case SrcMem:
+        ea.bytes = (d & ByteOp) ? 1 : op_bytes;
+    srcmem_common:
+        src = ea;
+        if ( src.type == OP_REG )
+        {
+            switch ( src.bytes )
+            {
+            case 1: src.val = *(uint8_t  *)src.reg; break;
+            case 2: src.val = *(uint16_t *)src.reg; break;
+            case 4: src.val = *(uint32_t *)src.reg; break;
+            case 8: src.val = *(uint64_t *)src.reg; break;
+            }
+        }
+        else if ( (rc = ops->read(src.mem.seg, src.mem.off,
+                                  &src.val, src.bytes, ctxt)) )
+            goto done;
+        break;
+    case SrcImm:
+        src.type  = OP_IMM;
+        src.bytes = (d & ByteOp) ? 1 : op_bytes;
+        if ( src.bytes == 8 ) src.bytes = 4;
+        /* NB. Immediates are sign-extended as necessary. */
+        switch ( src.bytes )
+        {
+        case 1: src.val = insn_fetch_type(int8_t);  break;
+        case 2: src.val = insn_fetch_type(int16_t); break;
+        case 4: src.val = insn_fetch_type(int32_t); break;
+        }
+        break;
+    case SrcImmByte:
+        src.type  = OP_IMM;
+        src.bytes = 1;
+        src.val   = insn_fetch_type(int8_t);
+        break;
+    }
+
+    /* Decode and fetch the destination operand: register or memory. */
+    switch ( d & DstMask )
+    {
+    case DstReg:
+        dst.type = OP_REG;
+        if ( d & ByteOp )
+        {
+            dst.reg = decode_register(modrm_reg, &_regs, (rex_prefix == 0));
+            dst.val = *(uint8_t *)dst.reg;
+            dst.bytes = 1;
+        }
+        else
+        {
+            dst.reg = decode_register(modrm_reg, &_regs, 0);
+            switch ( (dst.bytes = op_bytes) )
+            {
+            case 2: dst.val = *(uint16_t *)dst.reg; break;
+            case 4: dst.val = *(uint32_t *)dst.reg; break;
+            case 8: dst.val = *(uint64_t *)dst.reg; break;
+            }
+        }
+        break;
+    case DstBitBase:
+        if ( ((d & SrcMask) == SrcImmByte) || (ea.type == OP_REG) )
+        {
+            src.val &= (op_bytes << 3) - 1;
+        }
+        else
+        {
+            /*
+             * EA       += BitOffset DIV op_bytes*8
+             * BitOffset = BitOffset MOD op_bytes*8
+             * DIV truncates towards negative infinity.
+             * MOD always produces a positive result.
+             */
+            if ( op_bytes == 2 )
+                src.val = (int16_t)src.val;
+            else if ( op_bytes == 4 )
+                src.val = (int32_t)src.val;
+            if ( (long)src.val < 0 )
+            {
+                unsigned long byte_offset;
+                byte_offset = op_bytes + (((-src.val-1) >> 3) & ~(op_bytes-1));
+                ea.mem.off -= byte_offset;
+                src.val = (byte_offset << 3) + src.val;
+            }
+            else
+            {
+                ea.mem.off += (src.val >> 3) & ~(op_bytes - 1);
+                src.val &= (op_bytes << 3) - 1;
+            }
+        }
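+        /*
+         * Example with op_bytes == 4: bit offset 35 becomes bit 3 of the
+         * dword at ea+4, and bit offset -1 becomes bit 31 of the dword
+         * at ea-4.
+         */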
+        /* Becomes a normal DstMem operation from here on. */
+        d = (d & ~DstMask) | DstMem;
+    case DstMem:
+        ea.bytes = (d & ByteOp) ? 1 : op_bytes;
+        dst = ea;
+        if ( dst.type == OP_REG )
+        {
+            switch ( dst.bytes )
+            {
+            case 1: dst.val = *(uint8_t  *)dst.reg; break;
+            case 2: dst.val = *(uint16_t *)dst.reg; break;
+            case 4: dst.val = *(uint32_t *)dst.reg; break;
+            case 8: dst.val = *(uint64_t *)dst.reg; break;
+            }
+        }
+        else if ( !(d & Mov) ) /* optimisation - avoid slow emulated read */
+        {
+            if ( (rc = ops->read(dst.mem.seg, dst.mem.off,
+                                 &dst.val, dst.bytes, ctxt)) )
+                goto done;
+            dst.orig_val = dst.val;
+        }
+        break;
+    }
+
+    /* LOCK prefix allowed only on instructions with memory destination. */
+    generate_exception_if(lock_prefix && (dst.type != OP_MEM), EXC_GP);
+
+    if ( twobyte )
+        goto twobyte_insn;
+
+    switch ( b )
+    {
+    case 0x04 ... 0x05: /* add imm,%%eax */
+        dst.reg = (unsigned long *)&_regs.eax;
+        dst.val = _regs.eax;
+    case 0x00 ... 0x03: add: /* add */
+        emulate_2op_SrcV("add", src, dst, _regs.eflags);
+        break;
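+        /*
+         * The same pattern repeats through the 0x00-0x3f arithmetic
+         * block below: the imm,%eax encodings redirect dst to EAX and
+         * fall through to the shared register/memory handler of their
+         * group.
+         */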
+
+    case 0x0c ... 0x0d: /* or imm,%%eax */
+        dst.reg = (unsigned long *)&_regs.eax;
+        dst.val = _regs.eax;
+    case 0x08 ... 0x0b: or:  /* or */
+        emulate_2op_SrcV("or", src, dst, _regs.eflags);
+        break;
+
+    case 0x14 ... 0x15: /* adc imm,%%eax */
+        dst.reg = (unsigned long *)&_regs.eax;
+        dst.val = _regs.eax;
+    case 0x10 ... 0x13: adc: /* adc */
+        emulate_2op_SrcV("adc", src, dst, _regs.eflags);
+        break;
+
+    case 0x1c ... 0x1d: /* sbb imm,%%eax */
+        dst.reg = (unsigned long *)&_regs.eax;
+        dst.val = _regs.eax;
+    case 0x18 ... 0x1b: sbb: /* sbb */
+        emulate_2op_SrcV("sbb", src, dst, _regs.eflags);
+        break;
+
+    case 0x24 ... 0x25: /* and imm,%%eax */
+        dst.reg = (unsigned long *)&_regs.eax;
+        dst.val = _regs.eax;
+    case 0x20 ... 0x23: and: /* and */
+        emulate_2op_SrcV("and", src, dst, _regs.eflags);
+        break;
+
+    case 0x2c ... 0x2d: /* sub imm,%%eax */
+        dst.reg = (unsigned long *)&_regs.eax;
+        dst.val = _regs.eax;
+    case 0x28 ... 0x2b: sub: /* sub */
+        emulate_2op_SrcV("sub", src, dst, _regs.eflags);
+        break;
+
+    case 0x34 ... 0x35: /* xor imm,%%eax */
+        dst.reg = (unsigned long *)&_regs.eax;
+        dst.val = _regs.eax;
+    case 0x30 ... 0x33: xor: /* xor */
+        emulate_2op_SrcV("xor", src, dst, _regs.eflags);
+        break;
+
+    case 0x3c ... 0x3d: /* cmp imm,%%eax */
+        dst.reg = (unsigned long *)&_regs.eax;
+        dst.val = _regs.eax;
+    case 0x38 ... 0x3b: cmp: /* cmp */
+        emulate_2op_SrcV("cmp", src, dst, _regs.eflags);
+        break;
+
+    case 0x62: /* bound */ {
+        unsigned long src_val2;
+        int lb, ub, idx;
+        generate_exception_if(mode_64bit() || (src.type != OP_MEM), EXC_UD);
+        if ( (rc = ops->read(src.mem.seg, src.mem.off + op_bytes,
+                             &src_val2, op_bytes, ctxt)) )
+            goto done;
+        ub  = (op_bytes == 2) ? (int16_t)src_val2 : (int32_t)src_val2;
+        lb  = (op_bytes == 2) ? (int16_t)src.val  : (int32_t)src.val;
+        idx = (op_bytes == 2) ? (int16_t)dst.val  : (int32_t)dst.val;
+        generate_exception_if((idx < lb) || (idx > ub), EXC_BR);
+        dst.type = OP_NONE;
+        break;
+    }
+
+    case 0x63: /* movsxd (x86/64) / arpl (x86/32) */
+        if ( mode_64bit() )
+        {
+            /* movsxd */
+            if ( src.type == OP_REG )
+                src.val = *(int32_t *)src.reg;
+            else if ( (rc = ops->read(src.mem.seg, src.mem.off,
+                                      &src.val, 4, ctxt)) )
+                goto done;
+            dst.val = (int32_t)src.val;
+        }
+        else
+        {
+            /* arpl */
+            uint16_t src_val = dst.val;
+            dst = src;
+            _regs.eflags &= ~EFLG_ZF;
+            _regs.eflags |= ((src_val & 3) > (dst.val & 3)) ? EFLG_ZF : 0;
+            if ( _regs.eflags & EFLG_ZF )
+                dst.val  = (dst.val & ~3) | (src_val & 3);
+            else
+                dst.type = OP_NONE;
+        }
+        break;
+
+    case 0x69: /* imul imm16/32 */
+    case 0x6b: /* imul imm8 */ {
+        unsigned long reg = *(long *)decode_register(modrm_reg, &_regs, 0);
+        _regs.eflags &= ~(EFLG_OF|EFLG_CF);
+        switch ( dst.bytes )
+        {
+        case 2:
+            dst.val = ((uint32_t)(int16_t)src.val *
+                       (uint32_t)(int16_t)reg);
+            if ( (int16_t)dst.val != (uint32_t)dst.val )
+                _regs.eflags |= EFLG_OF|EFLG_CF;
+            break;
+#ifdef __x86_64__
+        case 4:
+            dst.val = ((uint64_t)(int32_t)src.val *
+                       (uint64_t)(int32_t)reg);
+            if ( (int32_t)dst.val != dst.val )
+                _regs.eflags |= EFLG_OF|EFLG_CF;
+            break;
+#endif
+        default: {
+            unsigned long m[2] = { src.val, reg };
+            if ( imul_dbl(m) )
+                _regs.eflags |= EFLG_OF|EFLG_CF;
+            dst.val = m[0];
+            break;
+        }
+        }
+        dst.type = OP_REG;
+        dst.reg  = decode_register(modrm_reg, &_regs, 0);
+        break;
+    }
+
+    case 0x82: /* Grp1 (x86/32 only) */
+        generate_exception_if(mode_64bit(), EXC_UD);
+    case 0x80: case 0x81: case 0x83: /* Grp1 */
+        switch ( modrm_reg & 7 )
+        {
+        case 0: goto add;
+        case 1: goto or;
+        case 2: goto adc;
+        case 3: goto sbb;
+        case 4: goto and;
+        case 5: goto sub;
+        case 6: goto xor;
+        case 7: goto cmp;
+        }
+        break;
+
+    case 0xa8 ... 0xa9: /* test imm,%%eax */
+        dst.reg = (unsigned long *)&_regs.eax;
+        dst.val = _regs.eax;
+    case 0x84 ... 0x85: test: /* test */
+        emulate_2op_SrcV("test", src, dst, _regs.eflags);
+        break;
+
+    case 0x86 ... 0x87: xchg: /* xchg */
+        /* Write back the register source. */
+        switch ( dst.bytes )
+        {
+        case 1: *(uint8_t  *)src.reg = (uint8_t)dst.val; break;
+        case 2: *(uint16_t *)src.reg = (uint16_t)dst.val; break;
+        case 4: *src.reg = (uint32_t)dst.val; break; /* 64b reg: zero-extend */
+        case 8: *src.reg = dst.val; break;
+        }
+        /* Write back the memory destination with implicit LOCK prefix. */
+        dst.val = src.val;
+        lock_prefix = 1;
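+        /*
+         * lock_prefix is forced above so that, for a memory destination,
+         * the generic writeback below goes through ops->cmpxchg(),
+         * matching XCHG's implicit LOCK semantics.
+         */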
+        break;
+
+    case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
+        generate_exception_if((modrm_reg & 7) != 0, EXC_UD);
+    case 0x88 ... 0x8b: /* mov */
+        dst.val = src.val;
+        break;
+
+    case 0x8d: /* lea */
+        dst.val = ea.mem.off;
+        break;
+
+    case 0x8f: /* pop (sole member of Grp1a) */
+        generate_exception_if((modrm_reg & 7) != 0, EXC_UD);
+        /* 64-bit mode: POP defaults to a 64-bit operand. */
+        if ( mode_64bit() && (dst.bytes == 4) )
+            dst.bytes = 8;
+        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes),
+                             &dst.val, dst.bytes, ctxt)) != 0 )
+            goto done;
+        break;
+
+    case 0xb0 ... 0xb7: /* mov imm8,r8 */
+        dst.reg = decode_register(
+            (b & 7) | ((rex_prefix & 1) << 3), &_regs, (rex_prefix == 0));
+        dst.val = src.val;
+        break;
+
+    case 0xb8 ... 0xbf: /* mov imm{16,32,64},r{16,32,64} */
+        if ( dst.bytes == 8 ) /* Fetch more bytes to obtain imm64 */
+            src.val = ((uint32_t)src.val |
+                       ((uint64_t)insn_fetch_type(uint32_t) << 32));
+        dst.reg = decode_register(
+            (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0);
+        dst.val = src.val;
+        break;
+
+    case 0xc0 ... 0xc1: grp2: /* Grp2 */
+        switch ( modrm_reg & 7 )
+        {
+        case 0: /* rol */
+            emulate_2op_SrcB("rol", src, dst, _regs.eflags);
+            break;
+        case 1: /* ror */
+            emulate_2op_SrcB("ror", src, dst, _regs.eflags);
+            break;
+        case 2: /* rcl */
+            emulate_2op_SrcB("rcl", src, dst, _regs.eflags);
+            break;
+        case 3: /* rcr */
+            emulate_2op_SrcB("rcr", src, dst, _regs.eflags);
+            break;
+        case 4: /* sal/shl */
+        case 6: /* sal/shl */
+            emulate_2op_SrcB("sal", src, dst, _regs.eflags);
+            break;
+        case 5: /* shr */
+            emulate_2op_SrcB("shr", src, dst, _regs.eflags);
+            break;
+        case 7: /* sar */
+            emulate_2op_SrcB("sar", src, dst, _regs.eflags);
+            break;
+        }
+        break;
+
+    case 0xd0 ... 0xd1: /* Grp2 */
+        src.val = 1;
+        goto grp2;
+
+    case 0xd2 ... 0xd3: /* Grp2 */
+        src.val = _regs.ecx;
+        goto grp2;
+
+    case 0xf6 ... 0xf7: /* Grp3 */
+        switch ( modrm_reg & 7 )
+        {
+        case 0 ... 1: /* test */
+            /* Special case in Grp3: test has an immediate source operand. */
+            src.type = OP_IMM;
+            src.bytes = (d & ByteOp) ? 1 : op_bytes;
+            if ( src.bytes == 8 ) src.bytes = 4;
+            switch ( src.bytes )
+            {
+            case 1: src.val = insn_fetch_type(int8_t);  break;
+            case 2: src.val = insn_fetch_type(int16_t); break;
+            case 4: src.val = insn_fetch_type(int32_t); break;
+            }
+            goto test;
+        case 2: /* not */
+            dst.val = ~dst.val;
+            break;
+        case 3: /* neg */
+            emulate_1op("neg", dst, _regs.eflags);
+            break;
+        case 4: /* mul */
+            src = dst;
+            dst.type = OP_REG;
+            dst.reg  = (unsigned long *)&_regs.eax;
+            dst.val  = *dst.reg;
+            _regs.eflags &= ~(EFLG_OF|EFLG_CF);
+            switch ( src.bytes )
+            {
+            case 1:
+                dst.val *= src.val;
+                if ( (uint8_t)dst.val != (uint16_t)dst.val )
+                    _regs.eflags |= EFLG_OF|EFLG_CF;
+                break;
+            case 2:
+                dst.val *= src.val;
+                if ( (uint16_t)dst.val != (uint32_t)dst.val )
+                    _regs.eflags |= EFLG_OF|EFLG_CF;
+                *(uint16_t *)&_regs.edx = dst.val >> 16;
+                break;
+#ifdef __x86_64__
+            case 4:
+                dst.val *= src.val;
+                if ( (uint32_t)dst.val != dst.val )
+                    _regs.eflags |= EFLG_OF|EFLG_CF;
+                _regs.edx = (uint32_t)(dst.val >> 32);
+                break;
+#endif
+            default: {
+                unsigned long m[2] = { src.val, dst.val };
+                if ( mul_dbl(m) )
+                    _regs.eflags |= EFLG_OF|EFLG_CF;
+                _regs.edx = m[1];
+                dst.val  = m[0];
+                break;
+            }
+            }
+            break;
+        case 5: /* imul */
+            src = dst;
+            dst.type = OP_REG;
+            dst.reg  = (unsigned long *)&_regs.eax;
+            dst.val  = *dst.reg;
+            _regs.eflags &= ~(EFLG_OF|EFLG_CF);
+            switch ( src.bytes )
+            {
+            case 1:
+                dst.val = ((uint16_t)(int8_t)src.val *
+                           (uint16_t)(int8_t)dst.val);
+                if ( (int8_t)dst.val != (uint16_t)dst.val )
+                    _regs.eflags |= EFLG_OF|EFLG_CF;
+                break;
+            case 2:
+                dst.val = ((uint32_t)(int16_t)src.val *
+                           (uint32_t)(int16_t)dst.val);
+                if ( (int16_t)dst.val != (uint32_t)dst.val )
+                    _regs.eflags |= EFLG_OF|EFLG_CF;
+                *(uint16_t *)&_regs.edx = dst.val >> 16;
+                break;
+#ifdef __x86_64__
+            case 4:
+                dst.val = ((uint64_t)(int32_t)src.val *
+                           (uint64_t)(int32_t)dst.val);
+                if ( (int32_t)dst.val != dst.val )
+                    _regs.eflags |= EFLG_OF|EFLG_CF;
+                _regs.edx = (uint32_t)(dst.val >> 32);
+                break;
+#endif
+            default: {
+                unsigned long m[2] = { src.val, dst.val };
+                if ( imul_dbl(m) )
+                    _regs.eflags |= EFLG_OF|EFLG_CF;
+                _regs.edx = m[1];
+                dst.val  = m[0];
+                break;
+            }
+            }
+            break;
+        case 6: /* div */ {
+            unsigned long u[2], v;
+            src = dst;
+            dst.type = OP_REG;
+            dst.reg  = (unsigned long *)&_regs.eax;
+            switch ( src.bytes )
+            {
+            case 1:
+                u[0] = (uint16_t)_regs.eax;
+                u[1] = 0;
+                v    = (uint8_t)src.val;
+                generate_exception_if(
+                    div_dbl(u, v) || ((uint8_t)u[0] != (uint16_t)u[0]),
+                    EXC_DE);
+                dst.val = (uint8_t)u[0];
+                ((uint8_t *)&_regs.eax)[1] = u[1];
+                break;
+            case 2:
+                u[0] = ((uint32_t)_regs.edx << 16) | (uint16_t)_regs.eax;
+                u[1] = 0;
+                v    = (uint16_t)src.val;
+                generate_exception_if(
+                    div_dbl(u, v) || ((uint16_t)u[0] != (uint32_t)u[0]),
+                    EXC_DE);
+                dst.val = (uint16_t)u[0];
+                *(uint16_t *)&_regs.edx = u[1];
+                break;
+#ifdef __x86_64__
+            case 4:
+                u[0] = (_regs.edx << 32) | (uint32_t)_regs.eax;
+                u[1] = 0;
+                v    = (uint32_t)src.val;
+                generate_exception_if(
+                    div_dbl(u, v) || ((uint32_t)u[0] != u[0]),
+                    EXC_DE);
+                dst.val   = (uint32_t)u[0];
+                _regs.edx = (uint32_t)u[1];
+                break;
+#endif
+            default:
+                u[0] = _regs.eax;
+                u[1] = _regs.edx;
+                v    = src.val;
+                generate_exception_if(div_dbl(u, v), EXC_DE);
+                dst.val   = u[0];
+                _regs.edx = u[1];
+                break;
+            }
+            break;
+        }
+        case 7: /* idiv */ {
+            unsigned long u[2], v;
+            src = dst;
+            dst.type = OP_REG;
+            dst.reg  = (unsigned long *)&_regs.eax;
+            switch ( src.bytes )
+            {
+            case 1:
+                u[0] = (int16_t)_regs.eax;
+                u[1] = ((long)u[0] < 0) ? ~0UL : 0UL;
+                v    = (int8_t)src.val;
+                generate_exception_if(
+                    idiv_dbl(u, v) || ((int8_t)u[0] != (int16_t)u[0]),
+                    EXC_DE);
+                dst.val = (int8_t)u[0];
+                ((int8_t *)&_regs.eax)[1] = u[1];
+                break;
+            case 2:
+                u[0] = (int32_t)((_regs.edx << 16) | (uint16_t)_regs.eax);
+                u[1] = ((long)u[0] < 0) ? ~0UL : 0UL;
+                v    = (int16_t)src.val;
+                generate_exception_if(
+                    idiv_dbl(u, v) || ((int16_t)u[0] != (int32_t)u[0]),
+                    EXC_DE);
+                dst.val = (int16_t)u[0];
+                *(int16_t *)&_regs.edx = u[1];
+                break;
+#ifdef __x86_64__
+            case 4:
+                u[0] = (_regs.edx << 32) | (uint32_t)_regs.eax;
+                u[1] = ((long)u[0] < 0) ? ~0UL : 0UL;
+                v    = (int32_t)src.val;
+                generate_exception_if(
+                    idiv_dbl(u, v) || ((int32_t)u[0] != u[0]),
+                    EXC_DE);
+                dst.val   = (int32_t)u[0];
+                _regs.edx = (uint32_t)u[1];
+                break;
+#endif
+            default:
+                u[0] = _regs.eax;
+                u[1] = _regs.edx;
+                v    = src.val;
+                generate_exception_if(idiv_dbl(u, v), EXC_DE);
+                dst.val   = u[0];
+                _regs.edx = u[1];
+                break;
+            }
+            break;
+        }
+        default:
+            goto cannot_emulate;
+        }
+        break;
+
+    case 0xfe: /* Grp4 */
+        generate_exception_if((modrm_reg & 7) >= 2, EXC_UD);
+    case 0xff: /* Grp5 */
+        switch ( modrm_reg & 7 )
+        {
+        case 0: /* inc */
+            emulate_1op("inc", dst, _regs.eflags);
+            break;
+        case 1: /* dec */
+            emulate_1op("dec", dst, _regs.eflags);
+            break;
+        case 2: /* call (near) */
+        case 4: /* jmp (near) */
+            if ( ((op_bytes = dst.bytes) != 8) && mode_64bit() )
+            {
+                dst.bytes = op_bytes = 8;
+                if ( dst.type == OP_REG )
+                    dst.val = *dst.reg;
+                else if ( (rc = ops->read(dst.mem.seg, dst.mem.off,
+                                          &dst.val, 8, ctxt)) != 0 )
+                    goto done;
+            }
+            src.val = _regs.eip;
+            _regs.eip = dst.val;
+            if ( (modrm_reg & 7) == 2 )
+                goto push; /* call */
+            break;
+        case 6: /* push */
+            /* 64-bit mode: PUSH defaults to a 64-bit operand. */
+            if ( mode_64bit() && (dst.bytes == 4) )
+            {
+                dst.bytes = 8;
+                if ( dst.type == OP_REG )
+                    dst.val = *dst.reg;
+                else if ( (rc = ops->read(dst.mem.seg, dst.mem.off,
+                                          &dst.val, 8, ctxt)) != 0 )
+                    goto done;
+            }
+            if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes),
+                                  dst.val, dst.bytes, ctxt)) != 0 )
+                goto done;
+            dst.type = OP_NONE;
+            break;
+        case 7:
+            generate_exception_if(1, EXC_UD);
+        default:
+            goto cannot_emulate;
+        }
+        break;
+    }
+
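+    /*
+     * Common exit: commit the destination operand (register write, plain
+     * memory write, or ops->cmpxchg() when a LOCK prefix is in effect),
+     * then copy the shadow register file back to the guest context.
+     */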
+ writeback:
+    switch ( dst.type )
+    {
+    case OP_REG:
+        /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
+        switch ( dst.bytes )
+        {
+        case 1: *(uint8_t  *)dst.reg = (uint8_t)dst.val; break;
+        case 2: *(uint16_t *)dst.reg = (uint16_t)dst.val; break;
+        case 4: *dst.reg = (uint32_t)dst.val; break; /* 64b: zero-ext */
+        case 8: *dst.reg = dst.val; break;
+        }
+        break;
+    case OP_MEM:
+        if ( !(d & Mov) && (dst.orig_val == dst.val) )
+            /* nothing to do */;
+        else if ( lock_prefix )
+            rc = ops->cmpxchg(
+                dst.mem.seg, dst.mem.off, dst.orig_val,
+                dst.val, dst.bytes, ctxt);
+        else
+            rc = ops->write(
+                dst.mem.seg, dst.mem.off, dst.val, dst.bytes, ctxt);
+        if ( rc != 0 )
+            goto done;
+    default:
+        break;
+    }
+
+    /* Commit shadow register state. */
+    _regs.eflags &= ~EF_RF;
+    *ctxt->regs = _regs;
+
+ done:
+    return rc;
+
+ special_insn:
+    dst.type = OP_NONE;
+
+    /*
+     * The only implicit-operands instructions allowed a LOCK prefix are
+     * CMPXCHG{8,16}B, MOV CRn, MOV DRn.
+     */
+    generate_exception_if(lock_prefix &&
+                          ((b < 0x20) || (b > 0x23)) && /* MOV CRn/DRn */
+                          (b != 0xc7),                  /* CMPXCHG{8,16}B */
+                          EXC_GP);
+
+    if ( twobyte )
+        goto twobyte_special_insn;
+
+    switch ( b )
+    {
+    case 0x27: /* daa */ {
+        uint8_t al = _regs.eax;
+        unsigned long eflags = _regs.eflags;
+        generate_exception_if(mode_64bit(), EXC_UD);
+        _regs.eflags &= ~(EFLG_CF|EFLG_AF);
+        if ( ((al & 0x0f) > 9) || (eflags & EFLG_AF) )
+        {
+            *(uint8_t *)&_regs.eax += 6;
+            _regs.eflags |= EFLG_AF;
+        }
+        if ( (al > 0x99) || (eflags & EFLG_CF) )
+        {
+            *(uint8_t *)&_regs.eax += 0x60;
+            _regs.eflags |= EFLG_CF;
+        }
+        _regs.eflags &= ~(EFLG_SF|EFLG_ZF|EFLG_PF);
+        _regs.eflags |= ((uint8_t)_regs.eax == 0) ? EFLG_ZF : 0;
+        _regs.eflags |= (( int8_t)_regs.eax <  0) ? EFLG_SF : 0;
+        _regs.eflags |= even_parity(_regs.eax) ? EFLG_PF : 0;
+        break;
+    }
+
+    case 0x2f: /* das */ {
+        uint8_t al = _regs.eax;
+        unsigned long eflags = _regs.eflags;
+        generate_exception_if(mode_64bit(), EXC_UD);
+        _regs.eflags &= ~(EFLG_CF|EFLG_AF);
+        if ( ((al & 0x0f) > 9) || (eflags & EFLG_AF) )
+        {
+            _regs.eflags |= EFLG_AF;
+            if ( (al < 6) || (eflags & EFLG_CF) )
+                _regs.eflags |= EFLG_CF;
+            *(uint8_t *)&_regs.eax -= 6;
+        }
+        if ( (al > 0x99) || (eflags & EFLG_CF) )
+        {
+            *(uint8_t *)&_regs.eax -= 0x60;
+            _regs.eflags |= EFLG_CF;
+        }
+        _regs.eflags &= ~(EFLG_SF|EFLG_ZF|EFLG_PF);
+        _regs.eflags |= ((uint8_t)_regs.eax == 0) ? EFLG_ZF : 0;
+        _regs.eflags |= (( int8_t)_regs.eax <  0) ? EFLG_SF : 0;
+        _regs.eflags |= even_parity(_regs.eax) ? EFLG_PF : 0;
+        break;
+    }
+
+    case 0x37: /* aaa */
+    case 0x3f: /* aas */
+        generate_exception_if(mode_64bit(), EXC_UD);
+        _regs.eflags &= ~EFLG_CF;
+        if ( ((uint8_t)_regs.eax > 9) || (_regs.eflags & EFLG_AF) )
+        {
+            ((uint8_t *)&_regs.eax)[0] += (b == 0x37) ? 6 : -6;
+            ((uint8_t *)&_regs.eax)[1] += (b == 0x37) ? 1 : -1;
+            _regs.eflags |= EFLG_CF | EFLG_AF;
+        }
+        ((uint8_t *)&_regs.eax)[0] &= 0x0f;
+        break;
+
+    case 0x40 ... 0x4f: /* inc/dec reg */
+        dst.type  = OP_REG;
+        dst.reg   = decode_register(b & 7, &_regs, 0);
+        dst.bytes = op_bytes;
+        dst.val   = *dst.reg;
+        if ( b & 8 )
+            emulate_1op("dec", dst, _regs.eflags);
+        else
+            emulate_1op("inc", dst, _regs.eflags);
+        break;
+
+    case 0x50 ... 0x57: /* push reg */
+        src.val = *(unsigned long *)decode_register(
+            (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0);
+        goto push;
+
+    case 0x58 ... 0x5f: /* pop reg */
+        dst.type  = OP_REG;
+        dst.reg   = decode_register(
+            (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0);
+        dst.bytes = op_bytes;
+        if ( mode_64bit() && (dst.bytes == 4) )
+            dst.bytes = 8;
+        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes),
+                             &dst.val, dst.bytes, ctxt)) != 0 )
+            goto done;
+        break;
+
+    case 0x60: /* pusha */ {
+        int i;
+        unsigned long regs[] = {
+            _regs.eax, _regs.ecx, _regs.edx, _regs.ebx,
+            _regs.esp, _regs.ebp, _regs.esi, _regs.edi };
+        generate_exception_if(mode_64bit(), EXC_UD);
+        for ( i = 0; i < 8; i++ )
+            if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
+                                  regs[i], op_bytes, ctxt)) != 0 )
+                goto done;
+        break;
+    }
+
+    case 0x61: /* popa */ {
+        int i;
+        unsigned long dummy_esp, *regs[] = {
+            (unsigned long *)&_regs.edi, (unsigned long *)&_regs.esi,
+            (unsigned long *)&_regs.ebp, (unsigned long *)&dummy_esp,
+            (unsigned long *)&_regs.ebx, (unsigned long *)&_regs.edx,
+            (unsigned long *)&_regs.ecx, (unsigned long *)&_regs.eax };
+        generate_exception_if(mode_64bit(), EXC_UD);
+        for ( i = 0; i < 8; i++ )
+            if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
+                                 regs[i], op_bytes, ctxt)) != 0 )
+                goto done;
+        break;
+    }
+
+    case 0x68: /* push imm{16,32,64} */
+        src.val = ((op_bytes == 2)
+                   ? (int32_t)insn_fetch_type(int16_t)
+                   : insn_fetch_type(int32_t));
+        goto push;
+
+    case 0x6a: /* push imm8 */
+        src.val = insn_fetch_type(int8_t);
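+    /*
+     * Shared tail for all PUSH flavours: callers arrive here with src.val
+     * holding the value to push.  It becomes a memory write at the newly
+     * decremented stack pointer, with a 4-byte operand widened to 8 bytes
+     * in 64-bit mode.
+     */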
+    push:
+        d |= Mov; /* force writeback */
+        dst.type  = OP_MEM;
+        dst.bytes = op_bytes;
+        if ( mode_64bit() && (dst.bytes == 4) )
+            dst.bytes = 8;
+        dst.val = src.val;
+        dst.mem.seg = x86_seg_ss;
+        dst.mem.off = sp_pre_dec(dst.bytes);
+        break;
+
+    case 0x6c ... 0x6d: /* ins %dx,%es:%edi */
+        handle_rep_prefix();
+        generate_exception_if(!mode_iopl(), EXC_GP);
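+        /*
+         * mode_iopl() is stubbed to 0 for now, so this check (and the
+         * matching one in the OUTS case below) makes port I/O string
+         * instructions bail out as unhandleable rather than be emulated.
+         */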
+        dst.type  = OP_MEM;
+        dst.bytes = !(b & 1) ? 1 : (op_bytes == 8) ? 4 : op_bytes;
+        dst.mem.seg = x86_seg_es;
+        dst.mem.off = truncate_ea(_regs.edi);
+        fail_if(ops->read_io == NULL);
+        if ( (rc = ops->read_io((uint16_t)_regs.edx, dst.bytes,
+                                &dst.val, ctxt)) != 0 )
+            goto done;
+        register_address_increment(
+            _regs.edi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
+        break;
+
+    case 0x6e ... 0x6f: /* outs %esi,%dx */
+        handle_rep_prefix();
+        generate_exception_if(!mode_iopl(), EXC_GP);
+        dst.bytes = !(b & 1) ? 1 : (op_bytes == 8) ? 4 : op_bytes;
+        if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
+                             &dst.val, dst.bytes, ctxt)) != 0 )
+            goto done;
+        fail_if(ops->write_io == NULL);
+        if ( (rc = ops->write_io((uint16_t)_regs.edx, dst.bytes,
+                                 dst.val, ctxt)) != 0 )
+            goto done;
+        register_address_increment(
+            _regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
+        break;
+
+    case 0x70 ... 0x7f: /* jcc (short) */ {
+        int rel = insn_fetch_type(int8_t);
+        if ( test_cc(b, _regs.eflags) )
+            jmp_rel(rel);
+        break;
+    }
+
+    case 0x90: /* nop / xchg %%r8,%%rax */
+        if ( !(rex_prefix & 1) )
+            break; /* nop */
+
+    case 0x91 ... 0x97: /* xchg reg,%%rax */
+        src.type = dst.type = OP_REG;
+        src.bytes = dst.bytes = op_bytes;
+        src.reg  = (unsigned long *)&_regs.eax;
+        src.val  = *src.reg;
+        dst.reg  = decode_register(
+            (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0);
+        dst.val  = *dst.reg;
+        goto xchg;
+
+    case 0x98: /* cbw/cwde/cdqe */
+        switch ( op_bytes )
+        {
+        case 2: *(int16_t *)&_regs.eax = (int8_t)_regs.eax; break; /* cbw */
+        case 4: _regs.eax = (uint32_t)(int16_t)_regs.eax; break; /* cwde */
+        case 8: _regs.eax = (int32_t)_regs.eax; break; /* cdqe */
+        }
+        break;
+
+    case 0x99: /* cwd/cdq/cqo */
+        switch ( op_bytes )
+        {
+        case 2:
+            *(int16_t *)&_regs.edx = ((int16_t)_regs.eax < 0) ? -1 : 0;
+            break;
+        case 4:
+            _regs.edx = (uint32_t)(((int32_t)_regs.eax < 0) ? -1 : 0);
+            break;
+        case 8:
+            _regs.edx = ((long)_regs.eax < 0) ? -1 : 0;
+            break;
+        }
+        break;
+
+    case 0x9e: /* sahf */
+        *(uint8_t *)&_regs.eflags = (((uint8_t *)&_regs.eax)[1] & 0xd7) | 0x02;
+        break;
+
+    case 0x9f: /* lahf */
+        ((uint8_t *)&_regs.eax)[1] = (_regs.eflags & 0xd7) | 0x02;
+        break;
+
+    case 0xa0 ... 0xa1: /* mov mem.offs,{%al,%ax,%eax,%rax} */
+        /* Source EA is not encoded via ModRM. */
+        dst.type  = OP_REG;
+        dst.reg   = (unsigned long *)&_regs.eax;
+        dst.bytes = (d & ByteOp) ? 1 : op_bytes;
+        if ( (rc = ops->read(ea.mem.seg, insn_fetch_bytes(ad_bytes),
+                             &dst.val, dst.bytes, ctxt)) != 0 )
+            goto done;
+        break;
+
+    case 0xa2 ... 0xa3: /* mov {%al,%ax,%eax,%rax},mem.offs */
+        /* Destination EA is not encoded via ModRM. */
+        dst.type  = OP_MEM;
+        dst.mem.seg = ea.mem.seg;
+        dst.mem.off = insn_fetch_bytes(ad_bytes);
+        dst.bytes = (d & ByteOp) ? 1 : op_bytes;
+        dst.val   = (unsigned long)_regs.eax;
+        break;
+
+    case 0xa4 ... 0xa5: /* movs */
+        handle_rep_prefix();
+        dst.type  = OP_MEM;
+        dst.bytes = (d & ByteOp) ? 1 : op_bytes;
+        dst.mem.seg = x86_seg_es;
+        dst.mem.off = truncate_ea(_regs.edi);
+        if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
+                             &dst.val, dst.bytes, ctxt)) != 0 )
+            goto done;
+        register_address_increment(
+            _regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
+        register_address_increment(
+            _regs.edi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
+        break;
+
+    case 0xaa ... 0xab: /* stos */
+        handle_rep_prefix();
+        dst.type  = OP_MEM;
+        dst.bytes = (d & ByteOp) ? 1 : op_bytes;
+        dst.mem.seg = x86_seg_es;
+        dst.mem.off = truncate_ea(_regs.edi);
+        dst.val   = _regs.eax;
+        register_address_increment(
+            _regs.edi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
+        break;
+
+    case 0xac ... 0xad: /* lods */
+        handle_rep_prefix();
+        dst.type  = OP_REG;
+        dst.bytes = (d & ByteOp) ? 1 : op_bytes;
+        dst.reg   = (unsigned long *)&_regs.eax;
+        if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
+                             &dst.val, dst.bytes, ctxt)) != 0 )
+            goto done;
+        register_address_increment(
+            _regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
+        break;
+
+    case 0xc2: /* ret imm16 (near) */
+    case 0xc3: /* ret (near) */ {
+        int offset = (b == 0xc2) ? insn_fetch_type(uint16_t) : 0;
+        op_bytes = mode_64bit() ? 8 : op_bytes;
+        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes + offset),
+                             &dst.val, op_bytes, ctxt)) != 0 )
+            goto done;
+        _regs.eip = dst.val;
+        break;
+    }
+
+    case 0xd4: /* aam */ {
+        unsigned int base = insn_fetch_type(uint8_t);
+        uint8_t al = _regs.eax;
+        generate_exception_if(mode_64bit(), EXC_UD);
+        generate_exception_if(base == 0, EXC_DE);
+        *(uint16_t *)&_regs.eax = ((al / base) << 8) | (al % base);
+        _regs.eflags &= ~(EFLG_SF|EFLG_ZF|EFLG_PF);
+        _regs.eflags |= ((uint8_t)_regs.eax == 0) ? EFLG_ZF : 0;
+        _regs.eflags |= (( int8_t)_regs.eax <  0) ? EFLG_SF : 0;
+        _regs.eflags |= even_parity(_regs.eax) ? EFLG_PF : 0;
+        break;
+    }
+
+    case 0xd5: /* aad */ {
+        unsigned int base = insn_fetch_type(uint8_t);
+        uint16_t ax = _regs.eax;
+        generate_exception_if(mode_64bit(), EXC_UD);
+        *(uint16_t *)&_regs.eax = (uint8_t)(ax + ((ax >> 8) * base));
+        _regs.eflags &= ~(EFLG_SF|EFLG_ZF|EFLG_PF);
+        _regs.eflags |= ((uint8_t)_regs.eax == 0) ? EFLG_ZF : 0;
+        _regs.eflags |= (( int8_t)_regs.eax <  0) ? EFLG_SF : 0;
+        _regs.eflags |= even_parity(_regs.eax) ? EFLG_PF : 0;
+        break;
+    }
+
+    case 0xd6: /* salc */
+        generate_exception_if(mode_64bit(), EXC_UD);
+        *(uint8_t *)&_regs.eax = (_regs.eflags & EFLG_CF) ? 0xff : 0x00;
+        break;
+
+    case 0xd7: /* xlat */ {
+        unsigned long al = (uint8_t)_regs.eax;
+        if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.ebx + al),
+                             &al, 1, ctxt)) != 0 )
+            goto done;
+        *(uint8_t *)&_regs.eax = al;
+        break;
+    }
+
+    case 0xe0 ... 0xe2: /* loop{,z,nz} */ {
+        int rel = insn_fetch_type(int8_t);
+        int do_jmp = !(_regs.eflags & EFLG_ZF); /* loopnz */
+        if ( b == 0xe1 )
+            do_jmp = !do_jmp; /* loopz */
+        else if ( b == 0xe2 )
+            do_jmp = 1; /* loop */
+        switch ( ad_bytes )
+        {
+        case 2:
+            do_jmp &= --(*(uint16_t *)&_regs.ecx) != 0;
+            break;
+        case 4:
+            do_jmp &= --(*(uint32_t *)&_regs.ecx) != 0;
+            _regs.ecx = (uint32_t)_regs.ecx; /* zero extend in x86/64 mode */
+            break;
+        default: /* case 8: */
+            do_jmp &= --_regs.ecx != 0;
+            break;
+        }
+        if ( do_jmp )
+            jmp_rel(rel);
+        break;
+    }
+
+    case 0xe3: /* jcxz/jecxz (short) */ {
+        int rel = insn_fetch_type(int8_t);
+        if ( (ad_bytes == 2) ? !(uint16_t)_regs.ecx :
+             (ad_bytes == 4) ? !(uint32_t)_regs.ecx : !_regs.ecx )
+            jmp_rel(rel);
+        break;
+    }
+
+    case 0xe4: /* in imm8,%al */
+    case 0xe5: /* in imm8,%eax */
+    case 0xe6: /* out %al,imm8 */
+    case 0xe7: /* out %eax,imm8 */
+    case 0xec: /* in %dx,%al */
+    case 0xed: /* in %dx,%eax */
+    case 0xee: /* out %al,%dx */
+    case 0xef: /* out %eax,%dx */ {
+        unsigned int port = ((b < 0xe8)
+                             ? insn_fetch_type(uint8_t)
+                             : (uint16_t)_regs.edx);
+        generate_exception_if(!mode_iopl(), EXC_GP);
+        op_bytes = !(b & 1) ? 1 : (op_bytes == 8) ? 4 : op_bytes;
+        if ( b & 2 )
+        {
+            /* out */
+            fail_if(ops->write_io == NULL);
+            rc = ops->write_io(port, op_bytes, _regs.eax, ctxt);
+
+        }
+        else
+        {
+            /* in */
+            dst.type  = OP_REG;
+            dst.bytes = op_bytes;
+            dst.reg   = (unsigned long *)&_regs.eax;
+            fail_if(ops->read_io == NULL);
+            rc = ops->read_io(port, dst.bytes, &dst.val, ctxt);
+        }
+        if ( rc != 0 )
+            goto done;
+        break;
+    }
+
+    case 0xe8: /* call (near) */ {
+        int rel = (((op_bytes == 2) && !mode_64bit())
+                   ? (int32_t)insn_fetch_type(int16_t)
+                   : insn_fetch_type(int32_t));
+        op_bytes = mode_64bit() ? 8 : op_bytes;
+        src.val = _regs.eip;
+        jmp_rel(rel);
+        goto push;
+    }
+
+    case 0xe9: /* jmp (near) */ {
+        int rel = (((op_bytes == 2) && !mode_64bit())
+                   ? (int32_t)insn_fetch_type(int16_t)
+                   : insn_fetch_type(int32_t));
+        jmp_rel(rel);
+        break;
+    }
+
+    case 0xeb: /* jmp (short) */
+        jmp_rel(insn_fetch_type(int8_t));
+        break;
+
+    case 0xf5: /* cmc */
+        _regs.eflags ^= EFLG_CF;
+        break;
+
+    case 0xf8: /* clc */
+        _regs.eflags &= ~EFLG_CF;
+        break;
+
+    case 0xf9: /* stc */
+        _regs.eflags |= EFLG_CF;
+        break;
+
+    case 0xfa: /* cli */
+        generate_exception_if(!mode_iopl(), EXC_GP);
+        fail_if(ops->write_rflags == NULL);
+        if ( (rc = ops->write_rflags(_regs.eflags & ~EFLG_IF, ctxt)) != 0 )
+            goto done;
+        break;
+
+    case 0xfb: /* sti */
+        generate_exception_if(!mode_iopl(), EXC_GP);
+        fail_if(ops->write_rflags == NULL);
+        if ( (rc = ops->write_rflags(_regs.eflags | EFLG_IF, ctxt)) != 0 )
+            goto done;
+        break;
+
+    case 0xfc: /* cld */
+        _regs.eflags &= ~EFLG_DF;
+        break;
+
+    case 0xfd: /* std */
+        _regs.eflags |= EFLG_DF;
+        break;
+    }
+    goto writeback;
+
+ twobyte_insn:
+    switch ( b )
+    {
+    case 0x40 ... 0x4f: /* cmovcc */
+        dst.val = src.val;
+        if ( !test_cc(b, _regs.eflags) )
+            dst.type = OP_NONE;
+        break;
+
+    case 0x90 ... 0x9f: /* setcc */
+        dst.val = test_cc(b, _regs.eflags);
+        break;
+
+    case 0xb0 ... 0xb1: /* cmpxchg */
+        /* Save real source value, then compare EAX against destination. */
+        src.orig_val = src.val;
+        src.val = _regs.eax;
+        emulate_2op_SrcV("cmp", src, dst, _regs.eflags);
+        /* Always write back. The question is: where to? */
+        d |= Mov;
+        if ( _regs.eflags & EFLG_ZF )
+        {
+            /* Success: write back to memory. */
+            dst.val = src.orig_val;
+        }
+        else
+        {
+            /* Failure: write the value we saw to EAX. */
+            dst.type = OP_REG;
+            dst.reg  = (unsigned long *)&_regs.eax;
+        }
+        break;
+
+    case 0xa3: bt: /* bt */
+        emulate_2op_SrcV_nobyte("bt", src, dst, _regs.eflags);
+        break;
+
+    case 0xb3: btr: /* btr */
+        emulate_2op_SrcV_nobyte("btr", src, dst, _regs.eflags);
+        break;
+
+    case 0xab: bts: /* bts */
+        emulate_2op_SrcV_nobyte("bts", src, dst, _regs.eflags);
+        break;
+
+    case 0xaf: /* imul */
+        _regs.eflags &= ~(EFLG_OF|EFLG_CF);
+        switch ( dst.bytes )
+        {
+        case 2:
+            dst.val = ((uint32_t)(int16_t)src.val *
+                       (uint32_t)(int16_t)dst.val);
+            if ( (int16_t)dst.val != (uint32_t)dst.val )
+                _regs.eflags |= EFLG_OF|EFLG_CF;
+            break;
+#ifdef __x86_64__
+        case 4:
+            dst.val = ((uint64_t)(int32_t)src.val *
+                       (uint64_t)(int32_t)dst.val);
+            if ( (int32_t)dst.val != dst.val )
+                _regs.eflags |= EFLG_OF|EFLG_CF;
+            break;
+#endif
+        default: {
+            unsigned long m[2] = { src.val, dst.val };
+            if ( imul_dbl(m) )
+                _regs.eflags |= EFLG_OF|EFLG_CF;
+            dst.val = m[0];
+            break;
+        }
+        }
+        break;
+
+    case 0xb6: /* movzx rm8,r{16,32,64} */
+        /* Recompute DstReg as we may have decoded AH/BH/CH/DH. */
+        dst.reg   = decode_register(modrm_reg, &_regs, 0);
+        dst.bytes = op_bytes;
+        dst.val   = (uint8_t)src.val;
+        break;
+
+    case 0xbc: /* bsf */ {
+        int zf;
+        asm ( "bsf %2,%0; setz %b1"
+              : "=r" (dst.val), "=q" (zf)
+              : "r" (src.val), "1" (0) );
+        _regs.eflags &= ~EFLG_ZF;
+        _regs.eflags |= zf ? EFLG_ZF : 0;
+        break;
+    }
+
+    case 0xbd: /* bsr */ {
+        int zf;
+        asm ( "bsr %2,%0; setz %b1"
+              : "=r" (dst.val), "=q" (zf)
+              : "r" (src.val), "1" (0) );
+        _regs.eflags &= ~EFLG_ZF;
+        _regs.eflags |= zf ? EFLG_ZF : 0;
+        break;
+    }
+
+    case 0xb7: /* movzx rm16,r{16,32,64} */
+        dst.val = (uint16_t)src.val;
+        break;
+
+    case 0xbb: btc: /* btc */
+        emulate_2op_SrcV_nobyte("btc", src, dst, _regs.eflags);
+        break;
+
+    case 0xba: /* Grp8 */
+        switch ( modrm_reg & 7 )
+        {
+        case 4: goto bt;
+        case 5: goto bts;
+        case 6: goto btr;
+        case 7: goto btc;
+        default: generate_exception_if(1, EXC_UD);
+        }
+        break;
+
+    case 0xbe: /* movsx rm8,r{16,32,64} */
+        /* Recompute DstReg as we may have decoded AH/BH/CH/DH. */
+        dst.reg   = decode_register(modrm_reg, &_regs, 0);
+        dst.bytes = op_bytes;
+        dst.val   = (int8_t)src.val;
+        break;
+
+    case 0xbf: /* movsx rm16,r{16,32,64} */
+        dst.val = (int16_t)src.val;
+        break;
+
+    case 0xc0 ... 0xc1: /* xadd */
+        /* Write back the register source. */
+        switch ( dst.bytes )
+        {
+        case 1: *(uint8_t  *)src.reg = (uint8_t)dst.val; break;
+        case 2: *(uint16_t *)src.reg = (uint16_t)dst.val; break;
+        case 4: *src.reg = (uint32_t)dst.val; break; /* 64b reg: zero-extend */
+        case 8: *src.reg = dst.val; break;
+        }
+        goto add;
+    }
+    goto writeback;
+
+ twobyte_special_insn:
+    switch ( b )
+    {
+    case 0x06: /* clts */
+        generate_exception_if(!mode_ring0(), EXC_GP);
+        fail_if((ops->read_cr == NULL) || (ops->write_cr == NULL));
+        if ( (rc = ops->read_cr(0, &dst.val, ctxt)) ||
+             (rc = ops->write_cr(0, dst.val&~8, ctxt)) )
+            goto done;
+        break;
+
+    case 0x08: /* invd */
+    case 0x09: /* wbinvd */
+        generate_exception_if(!mode_ring0(), EXC_GP);
+        fail_if(ops->wbinvd == NULL);
+        if ( (rc = ops->wbinvd(ctxt)) != 0 )
+            goto done;
+        break;
+
+    case 0x0d: /* GrpP (prefetch) */
+    case 0x18: /* Grp16 (prefetch/nop) */
+    case 0x19 ... 0x1f: /* nop (amd-defined) */
+        break;
+
+    case 0x20: /* mov cr,reg */
+    case 0x21: /* mov dr,reg */
+    case 0x22: /* mov reg,cr */
+    case 0x23: /* mov reg,dr */
+        generate_exception_if(!mode_ring0(), EXC_GP);
+        modrm_rm  |= (rex_prefix & 1) << 3;
+        modrm_reg |= lock_prefix << 3;
+        if ( b & 2 )
+        {
+            /* Write to CR/DR. */
+            src.val = *(unsigned long *)decode_register(modrm_rm, &_regs, 0);
+            if ( !mode_64bit() )
+                src.val = (uint32_t)src.val;
+            rc = ((b & 1)
+                  ? (ops->write_dr
+                     ? ops->write_dr(modrm_reg, src.val, ctxt)
+                     : X86EMUL_UNHANDLEABLE)
+                  : (ops->write_cr
+                     ? ops->write_cr(modrm_reg, src.val, ctxt)
+                     : X86EMUL_UNHANDLEABLE));
+        }
+        else
+        {
+            /* Read from CR/DR. */
+            dst.type  = OP_REG;
+            dst.bytes = mode_64bit() ? 8 : 4;
+            dst.reg   = decode_register(modrm_rm, &_regs, 0);
+            rc = ((b & 1)
+                  ? (ops->read_dr
+                     ? ops->read_dr(modrm_reg, &dst.val, ctxt)
+                     : X86EMUL_UNHANDLEABLE)
+                  : (ops->read_cr
+                     ? ops->read_cr(modrm_reg, &dst.val, ctxt)
+                     : X86EMUL_UNHANDLEABLE));
+        }
+        if ( rc != 0 )
+            goto done;
+        break;
+
+    case 0x30: /* wrmsr */ {
+        uint64_t val = ((uint64_t)_regs.edx << 32) | (uint32_t)_regs.eax;
+        generate_exception_if(!mode_ring0(), EXC_GP);
+        fail_if(ops->write_msr == NULL);
+        if ( (rc = ops->write_msr((uint32_t)_regs.ecx, val, ctxt)) != 0 )
+            goto done;
+        break;
+    }
+
+    case 0x32: /* rdmsr */ {
+        uint64_t val;
+        generate_exception_if(!mode_ring0(), EXC_GP);
+        fail_if(ops->read_msr == NULL);
+        if ( (rc = ops->read_msr((uint32_t)_regs.ecx, &val, ctxt)) != 0 )
+            goto done;
+        _regs.edx = (uint32_t)(val >> 32);
+        _regs.eax = (uint32_t)(val >>  0);
+        break;
+    }
+
+    case 0x80 ... 0x8f: /* jcc (near) */ {
+        int rel = (((op_bytes == 2) && !mode_64bit())
+                   ? (int32_t)insn_fetch_type(int16_t)
+                   : insn_fetch_type(int32_t));
+        if ( test_cc(b, _regs.eflags) )
+            jmp_rel(rel);
+        break;
+    }
+
+    case 0xc7: /* Grp9 (cmpxchg8b) */
+#if defined(__i386__)
+    {
+        unsigned long old_lo, old_hi;
+        generate_exception_if((modrm_reg & 7) != 1, EXC_UD);
+        if ( (rc = ops->read(ea.mem.seg, ea.mem.off+0, &old_lo, 4, ctxt)) ||
+             (rc = ops->read(ea.mem.seg, ea.mem.off+4, &old_hi, 4, ctxt)) )
+            goto done;
+        if ( (old_lo != _regs.eax) || (old_hi != _regs.edx) )
+        {
+            _regs.eax = old_lo;
+            _regs.edx = old_hi;
+            _regs.eflags &= ~EFLG_ZF;
+        }
+        else if ( ops->cmpxchg8b == NULL )
+        {
+            rc = X86EMUL_UNHANDLEABLE;
+            goto done;
+        }
+        else
+        {
+            if ( (rc = ops->cmpxchg8b(ea.mem.seg, ea.mem.off, old_lo, old_hi,
+                                      _regs.ebx, _regs.ecx, ctxt)) != 0 )
+                goto done;
+            _regs.eflags |= EFLG_ZF;
+        }
+        break;
+    }
+#elif defined(__x86_64__)
+    {
+        unsigned long old, new;
+        generate_exception_if((modrm_reg & 7) != 1, EXC_UD);
+        if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &old, 8, ctxt)) != 0 )
+            goto done;
+        if ( ((uint32_t)(old>>0) != (uint32_t)_regs.eax) ||
+             ((uint32_t)(old>>32) != (uint32_t)_regs.edx) )
+        {
+            _regs.eax = (uint32_t)(old>>0);
+            _regs.edx = (uint32_t)(old>>32);
+            _regs.eflags &= ~EFLG_ZF;
+        }
+        else
+        {
+            new = (_regs.ecx<<32)|(uint32_t)_regs.ebx;
+            if ( (rc = ops->cmpxchg(ea.mem.seg, ea.mem.off, old,
+                                    new, 8, ctxt)) != 0 )
+                goto done;
+            _regs.eflags |= EFLG_ZF;
+        }
+        break;
+    }
+#endif
+
+    case 0xc8 ... 0xcf: /* bswap */
+        dst.type = OP_REG;
+        dst.reg  = decode_register(
+            (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0);
+        switch ( dst.bytes = op_bytes )
+        {
+        default: /* case 2: */
+            /* Undefined behaviour. Writes zero on all tested CPUs. */
+            dst.val = 0;
+            break;
+        case 4:
+#ifdef __x86_64__
+            __asm__ ( "bswap %k0" : "=r" (dst.val) : "0" (*dst.reg) );
+            break;
+        case 8:
+#endif
+            __asm__ ( "bswap %0" : "=r" (dst.val) : "0" (*dst.reg) );
+            break;
+        }
+        break;
+    }
+    goto writeback;
+
+ cannot_emulate:
+#if 0
+    gdprintk(XENLOG_DEBUG, "Instr:");
+    for ( ea.mem.off = ctxt->regs->eip; ea.mem.off < _regs.eip; ea.mem.off++ )
+    {
+        unsigned long x;
+        ops->insn_fetch(x86_seg_cs, ea.mem.off, &x, 1, ctxt);
+        printk(" %02x", (uint8_t)x);
+    }
+    printk("\n");
+#endif
+    return X86EMUL_UNHANDLEABLE;
+}
diff -Naurp xen/common/domain.c xen-redhat/common/domain.c
--- xen/common/domain.c
+++ xen-redhat/common/domain.c
@@ -30,6 +30,24 @@
 #include <public/vcpu.h>
 #include <acm/acm_hooks.h>
 
+/* opt_dom0_vcpus_pin: If true, dom0 VCPUs are pinned. */
+unsigned int opt_dom0_vcpus_pin = 1;
+boolean_param("dom0_vcpus_pin", opt_dom0_vcpus_pin);
+
+enum cpufreq_controller cpufreq_controller = FREQCTL_dom0_kernel;
+static void __init setup_cpufreq_option(char *str)
+{
+    if ( !strcmp(str, "dom0-kernel") )
+    {
+        cpufreq_controller = FREQCTL_dom0_kernel;
+        opt_dom0_vcpus_pin = 1;
+    } else if ( !strcmp(str, "off") || !strcmp(str, "none") ) {
+        cpufreq_controller = FREQCTL_none;
+        opt_dom0_vcpus_pin = 0;
+    }
+}
+custom_param("cpufreq", setup_cpufreq_option);
+
 /* Protect updates/reads (resp.) of domain_list and domain_hash. */
 DEFINE_SPINLOCK(domlist_update_lock);
 DEFINE_RCU_READ_LOCK(domlist_read_lock);
@@ -63,6 +81,8 @@ struct domain *alloc_domain(domid_t domi
     spin_lock_init(&d->shutdown_lock);
     INIT_LIST_HEAD(&d->page_list);
     INIT_LIST_HEAD(&d->xenpage_list);
+    /* HV */
+    atomic_set(&d->hard_virt, 0);
 
     return d;
 }
@@ -189,6 +209,9 @@ struct domain *domain_create(
     if ( domcr_flags & DOMCRF_hvm )
         d->is_hvm = 1;
 
+    if ( (domid == 0) && opt_dom0_vcpus_pin )
+        d->is_pinned = 1;
+
     rangeset_domain_initialise(d);
 
     if ( !is_idle_domain(d) )
@@ -238,7 +261,7 @@ struct domain *domain_create(
     return d;
 
  fail:
-    d->is_dying = 1;
+    d->is_dying = DOMDYING_dead;
     atomic_set(&d->refcnt, DOMAIN_DESTROYED);
     if ( init_status & INIT_arch )
         arch_domain_destroy(d);
@@ -298,26 +321,38 @@ struct domain *rcu_lock_domain_by_id(dom
 }
 
 
-void domain_kill(struct domain *d)
+int domain_kill(struct domain *d)
 {
-    domain_pause(d);
+    int rc = 0;
 
-    /* Already dying? Then bail. */
-    if ( test_and_set_bool(d->is_dying) )
+    if ( d == current->domain )
+        return -EINVAL;
+
+    /* Protected by domctl_lock. */
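+    /* Teardown can span several calls: -EAGAIN from relinquish means it is not finished and must be retried. */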
+    switch ( d->is_dying )
     {
-        domain_unpause(d);
-        return;
+    case DOMDYING_alive:
+        domain_pause(d);
+        d->is_dying = DOMDYING_dying;
+        evtchn_destroy(d);
+        gnttab_release_mappings(d);
+        /* fallthrough */
+    case DOMDYING_dying:
+        rc = domain_relinquish_resources(d);
+        if ( rc != 0 )
+        {
+            BUG_ON(rc != -EAGAIN);
+            break;
+        }
+        d->is_dying = DOMDYING_dead;
+        put_domain(d);
+        send_guest_global_virq(dom0, VIRQ_DOM_EXC);
+        /* fallthrough */
+    case DOMDYING_dead:
+        break;
     }
 
-    evtchn_destroy(d);
-    gnttab_release_mappings(d);
-    domain_relinquish_resources(d);
-    put_domain(d);
-
-    /* Kick page scrubbing after domain_relinquish_resources(). */
-    page_scrub_kick();
-
-    send_guest_global_virq(dom0, VIRQ_DOM_EXC);
+    return rc;
 }
 
 
diff -Naurp xen/common/domctl.c xen-redhat/common/domctl.c
--- xen/common/domctl.c
+++ xen-redhat/common/domctl.c
@@ -43,7 +43,8 @@ void cpumask_to_xenctl_cpumap(
 
     bitmap_long_to_byte(bytemap, cpus_addr(*cpumask), NR_CPUS);
 
-    copy_to_guest(xenctl_cpumap->bitmap, bytemap, copy_bytes);
+    if ( copy_bytes != 0 )
+        copy_to_guest(xenctl_cpumap->bitmap, bytemap, copy_bytes);
 
     for ( i = copy_bytes; i < guest_bytes; i++ )
         copy_to_guest_offset(xenctl_cpumap->bitmap, i, &zero, 1);
@@ -55,15 +56,20 @@ void xenctl_cpumap_to_cpumask(
     unsigned int guest_bytes, copy_bytes;
     uint8_t bytemap[(NR_CPUS + 7) / 8];
 
+    if ( guest_handle_is_null(xenctl_cpumap->bitmap) )
+        return;
+
     guest_bytes = (xenctl_cpumap->nr_cpus + 7) / 8;
     copy_bytes  = min_t(unsigned int, guest_bytes, sizeof(bytemap));
 
-    cpus_clear(*cpumask);
-
-    if ( guest_handle_is_null(xenctl_cpumap->bitmap) )
-        return;
+    memset(bytemap, 0, sizeof(bytemap));
 
-    copy_from_guest(bytemap, xenctl_cpumap->bitmap, copy_bytes);
+    if ( copy_bytes != 0 )
+    {
+        copy_from_guest(bytemap, xenctl_cpumap->bitmap, copy_bytes);
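+        /* Mask off any bits beyond nr_cpus in the last copied byte. */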
+        if ( (xenctl_cpumap->nr_cpus & 7) && (guest_bytes <= sizeof(bytemap)) )
+            bytemap[guest_bytes-1] &= ~(0xff << (xenctl_cpumap->nr_cpus & 7));
+    }
 
     bitmap_byte_to_long(cpus_addr(*cpumask), bytemap, NR_CPUS);
 }
@@ -114,10 +120,10 @@ void getdomaininfo(struct domain *d, str
     info->cpu_time = cpu_time;
 
     info->flags = flags |
-        (d->is_dying                ? XEN_DOMINF_dying    : 0) |
-        (d->is_shut_down            ? XEN_DOMINF_shutdown : 0) |
-        (d->is_paused_by_controller ? XEN_DOMINF_paused   : 0) |
-        (d->debugger_attached       ? XEN_DOMINF_debugged : 0) |
+        ((d->is_dying == DOMDYING_dead) ? XEN_DOMINF_dying    : 0) |
+        (d->is_shut_down                ? XEN_DOMINF_shutdown : 0) |
+        (d->is_paused_by_controller     ? XEN_DOMINF_paused   : 0) |
+        (d->debugger_attached           ? XEN_DOMINF_debugged : 0) |
         d->shutdown_code << XEN_DOMINF_shutdownshift;
 
     if ( is_hvm_domain(d) )
@@ -188,7 +194,8 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
     if ( op->interface_version != XEN_DOMCTL_INTERFACE_VERSION )
         return -EACCES;
 
-    spin_lock(&domctl_lock);
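+    /* Do not block on the global lock: retry via a hypercall continuation instead. */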
+    if ( !spin_trylock(&domctl_lock) )
+      return hypercall_create_continuation(__HYPERVISOR_domctl, "h", u_domctl);
 
     switch ( op->cmd )
     {
@@ -222,13 +229,15 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
         if ( (c.nat = xmalloc(struct vcpu_guest_context)) == NULL )
             goto svc_out;
 
-        if ( !IS_COMPAT(v->domain) )
-            ret = copy_from_guest(c.nat, op->u.vcpucontext.ctxt, 1);
 #ifdef CONFIG_COMPAT
+        if ( !is_pv_32on64_vcpu(v) )
+            ret = copy_from_guest(c.nat, op->u.vcpucontext.ctxt, 1);
         else
             ret = copy_from_guest(c.cmp,
                                   guest_handle_cast(op->u.vcpucontext.ctxt,
                                                     void), 1);
+#else
+        ret = copy_from_guest(c.nat, op->u.vcpucontext.ctxt, 1);
 #endif
         ret = ret ? -EFAULT : 0;
 
@@ -397,10 +406,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
         {
             ret = -EINVAL;
             if ( d != current->domain )
-            {
-                domain_kill(d);
-                ret = 0;
-            }
+                ret = domain_kill(d);
             rcu_unlock_domain(d);
         }
     }
@@ -527,12 +533,14 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
         if ( v != current )
             vcpu_unpause(v);
 
-        if ( !IS_COMPAT(v->domain) )
-            ret = copy_to_guest(op->u.vcpucontext.ctxt, c.nat, 1);
 #ifdef CONFIG_COMPAT
+        if ( !is_pv_32on64_vcpu(v) )
+            ret = copy_to_guest(op->u.vcpucontext.ctxt, c.nat, 1);
         else
             ret = copy_to_guest(guest_handle_cast(op->u.vcpucontext.ctxt,
                                                   void), c.cmp, 1);
+#else
+        ret = copy_to_guest(op->u.vcpucontext.ctxt, c.nat, 1);
 #endif
 
         if ( copy_to_guest(u_domctl, op, 1) || ret )
diff -Naurp xen/common/event_channel.c xen-redhat/common/event_channel.c
--- xen/common/event_channel.c
+++ xen-redhat/common/event_channel.c
@@ -118,7 +118,7 @@ static long evtchn_alloc_unbound(evtchn_
     if ( (d = rcu_lock_domain_by_id(dom)) == NULL )
         return -ESRCH;
 
-    spin_lock(&d->evtchn_lock);
+    spin_lock(&d->event_lock);
 
     if ( (port = get_free_port(d)) < 0 )
         ERROR_EXIT(port);
@@ -131,7 +131,7 @@ static long evtchn_alloc_unbound(evtchn_
     alloc->port = port;
 
  out:
-    spin_unlock(&d->evtchn_lock);
+    spin_unlock(&d->event_lock);
 
     rcu_unlock_domain(d);
 
@@ -159,14 +159,14 @@ static long evtchn_bind_interdomain(evtc
     /* Avoid deadlock by first acquiring lock of domain with smaller id. */
     if ( ld < rd )
     {
-        spin_lock(&ld->evtchn_lock);
-        spin_lock(&rd->evtchn_lock);
+        spin_lock(&ld->event_lock);
+        spin_lock(&rd->event_lock);
     }
     else
     {
         if ( ld != rd )
-            spin_lock(&rd->evtchn_lock);
-        spin_lock(&ld->evtchn_lock);
+            spin_lock(&rd->event_lock);
+        spin_lock(&ld->event_lock);
     }
 
     if ( (lport = get_free_port(ld)) < 0 )
@@ -197,9 +197,9 @@ static long evtchn_bind_interdomain(evtc
     bind->local_port = lport;
 
  out:
-    spin_unlock(&ld->evtchn_lock);
+    spin_unlock(&ld->event_lock);
     if ( ld != rd )
-        spin_unlock(&rd->evtchn_lock);
+        spin_unlock(&rd->event_lock);
     
     rcu_unlock_domain(rd);
 
@@ -225,7 +225,7 @@ static long evtchn_bind_virq(evtchn_bind
          ((v = d->vcpu[vcpu]) == NULL) )
         return -ENOENT;
 
-    spin_lock(&d->evtchn_lock);
+    spin_lock(&d->event_lock);
 
     if ( v->virq_to_evtchn[virq] != 0 )
         ERROR_EXIT(-EEXIST);
@@ -241,7 +241,7 @@ static long evtchn_bind_virq(evtchn_bind
     v->virq_to_evtchn[virq] = bind->port = port;
 
  out:
-    spin_unlock(&d->evtchn_lock);
+    spin_unlock(&d->event_lock);
 
     return rc;
 }
@@ -258,7 +258,7 @@ static long evtchn_bind_ipi(evtchn_bind_
          (d->vcpu[vcpu] == NULL) )
         return -ENOENT;
 
-    spin_lock(&d->evtchn_lock);
+    spin_lock(&d->event_lock);
 
     if ( (port = get_free_port(d)) < 0 )
         ERROR_EXIT(port);
@@ -270,7 +270,7 @@ static long evtchn_bind_ipi(evtchn_bind_
     bind->port = port;
 
  out:
-    spin_unlock(&d->evtchn_lock);
+    spin_unlock(&d->event_lock);
 
     return rc;
 }
@@ -289,7 +289,7 @@ static long evtchn_bind_pirq(evtchn_bind
     if ( !irq_access_permitted(d, pirq) )
         return -EPERM;
 
-    spin_lock(&d->evtchn_lock);
+    spin_lock(&d->event_lock);
 
     if ( d->pirq_to_evtchn[pirq] != 0 )
         ERROR_EXIT(-EEXIST);
@@ -314,7 +314,7 @@ static long evtchn_bind_pirq(evtchn_bind
     bind->port = port;
 
  out:
-    spin_unlock(&d->evtchn_lock);
+    spin_unlock(&d->event_lock);
 
     return rc;
 }
@@ -329,7 +329,7 @@ static long __evtchn_close(struct domain
     long           rc = 0;
 
  again:
-    spin_lock(&d1->evtchn_lock);
+    spin_lock(&d1->event_lock);
 
     if ( !port_is_valid(d1, port1) )
     {
@@ -357,8 +357,8 @@ static long __evtchn_close(struct domain
         break;
 
     case ECS_PIRQ:
-        if ( (rc = pirq_guest_unbind(d1, chn1->u.pirq)) == 0 )
-            d1->pirq_to_evtchn[chn1->u.pirq] = 0;
+        pirq_guest_unbind(d1, chn1->u.pirq);
+        d1->pirq_to_evtchn[chn1->u.pirq] = 0;
         break;
 
     case ECS_VIRQ:
@@ -381,12 +381,12 @@ static long __evtchn_close(struct domain
 
             if ( d1 < d2 )
             {
-                spin_lock(&d2->evtchn_lock);
+                spin_lock(&d2->event_lock);
             }
             else if ( d1 != d2 )
             {
-                spin_unlock(&d1->evtchn_lock);
-                spin_lock(&d2->evtchn_lock);
+                spin_unlock(&d1->event_lock);
+                spin_lock(&d2->event_lock);
                 goto again;
             }
         }
@@ -426,11 +426,11 @@ static long __evtchn_close(struct domain
     if ( d2 != NULL )
     {
         if ( d1 != d2 )
-            spin_unlock(&d2->evtchn_lock);
+            spin_unlock(&d2->event_lock);
         put_domain(d2);
     }
 
-    spin_unlock(&d1->evtchn_lock);
+    spin_unlock(&d1->event_lock);
 
     return rc;
 }
@@ -449,11 +449,11 @@ long evtchn_send(unsigned int lport)
     struct vcpu   *rvcpu;
     int            rport, ret = 0;
 
-    spin_lock(&ld->evtchn_lock);
+    spin_lock(&ld->event_lock);
 
     if ( unlikely(!port_is_valid(ld, lport)) )
     {
-        spin_unlock(&ld->evtchn_lock);
+        spin_unlock(&ld->event_lock);
         return -EINVAL;
     }
 
@@ -462,7 +462,7 @@ long evtchn_send(unsigned int lport)
     /* Guest cannot send via a Xen-attached event channel. */
     if ( unlikely(lchn->consumer_is_xen) )
     {
-        spin_unlock(&ld->evtchn_lock);
+        spin_unlock(&ld->event_lock);
         return -EINVAL;
     }
 
@@ -495,7 +495,7 @@ long evtchn_send(unsigned int lport)
         ret = -EINVAL;
     }
 
-    spin_unlock(&ld->evtchn_lock);
+    spin_unlock(&ld->event_lock);
 
     return ret;
 }
@@ -517,7 +517,7 @@ void evtchn_set_pending(struct vcpu *v, 
         return;
 
     if ( !test_bit        (port, __shared_info_addr(d, s, evtchn_mask)) &&
-         !test_and_set_bit(port / BITS_PER_GUEST_LONG(d),
+         !test_and_set_bit(port / BITS_PER_EVTCHN_WORD(d),
                            vcpu_info_addr(v, evtchn_pending_sel)) )
     {
         vcpu_mark_events_pending(v);
@@ -604,7 +604,7 @@ static long evtchn_status(evtchn_status_
     if ( (d = rcu_lock_domain_by_id(dom)) == NULL )
         return -ESRCH;
 
-    spin_lock(&d->evtchn_lock);
+    spin_lock(&d->event_lock);
 
     if ( !port_is_valid(d, port) )
     {
@@ -647,7 +647,7 @@ static long evtchn_status(evtchn_status_
     status->vcpu = chn->notify_vcpu_id;
 
  out:
-    spin_unlock(&d->evtchn_lock);
+    spin_unlock(&d->event_lock);
     rcu_unlock_domain(d);
     return rc;
 }
@@ -662,7 +662,7 @@ long evtchn_bind_vcpu(unsigned int port,
     if ( (vcpu_id >= ARRAY_SIZE(d->vcpu)) || (d->vcpu[vcpu_id] == NULL) )
         return -ENOENT;
 
-    spin_lock(&d->evtchn_lock);
+    spin_lock(&d->event_lock);
 
     if ( !port_is_valid(d, port) )
     {
@@ -698,7 +698,7 @@ long evtchn_bind_vcpu(unsigned int port,
     }
 
  out:
-    spin_unlock(&d->evtchn_lock);
+    spin_unlock(&d->event_lock);
     return rc;
 }
 
@@ -710,11 +710,11 @@ static long evtchn_unmask(evtchn_unmask_
     int            port = unmask->port;
     struct vcpu   *v;
 
-    spin_lock(&d->evtchn_lock);
+    spin_lock(&d->event_lock);
 
     if ( unlikely(!port_is_valid(d, port)) )
     {
-        spin_unlock(&d->evtchn_lock);
+        spin_unlock(&d->event_lock);
         return -EINVAL;
     }
 
@@ -726,13 +726,13 @@ static long evtchn_unmask(evtchn_unmask_
      */
     if ( test_and_clear_bit(port, __shared_info_addr(d, s, evtchn_mask)) &&
          test_bit          (port, __shared_info_addr(d, s, evtchn_pending)) &&
-         !test_and_set_bit (port / BITS_PER_GUEST_LONG(d),
+         !test_and_set_bit (port / BITS_PER_EVTCHN_WORD(d),
                             vcpu_info_addr(v, evtchn_pending_sel)) )
     {
         vcpu_mark_events_pending(v);
     }
 
-    spin_unlock(&d->evtchn_lock);
+    spin_unlock(&d->event_lock);
 
     return 0;
 }
@@ -883,7 +883,7 @@ int alloc_unbound_xen_event_channel(
     struct domain *d = local_vcpu->domain;
     int            port;
 
-    spin_lock(&d->evtchn_lock);
+    spin_lock(&d->event_lock);
 
     if ( (port = get_free_port(d)) < 0 )
         goto out;
@@ -895,7 +895,7 @@ int alloc_unbound_xen_event_channel(
     chn->u.unbound.remote_domid = remote_domid;
 
  out:
-    spin_unlock(&d->evtchn_lock);
+    spin_unlock(&d->event_lock);
 
     return port;
 }
@@ -907,11 +907,11 @@ void free_xen_event_channel(
     struct evtchn *chn;
     struct domain *d = local_vcpu->domain;
 
-    spin_lock(&d->evtchn_lock);
+    spin_lock(&d->event_lock);
     chn = evtchn_from_port(d, port);
     BUG_ON(!chn->consumer_is_xen);
     chn->consumer_is_xen = 0;
-    spin_unlock(&d->evtchn_lock);
+    spin_unlock(&d->event_lock);
 
     (void)__evtchn_close(d, port);
 }
@@ -923,7 +923,7 @@ void notify_via_xen_event_channel(int lp
     struct domain *ld = current->domain, *rd;
     int            rport;
 
-    spin_lock(&ld->evtchn_lock);
+    spin_lock(&ld->event_lock);
 
     ASSERT(port_is_valid(ld, lport));
     lchn = evtchn_from_port(ld, lport);
@@ -937,13 +937,13 @@ void notify_via_xen_event_channel(int lp
         evtchn_set_pending(rd->vcpu[rchn->notify_vcpu_id], rport);
     }
 
-    spin_unlock(&ld->evtchn_lock);
+    spin_unlock(&ld->event_lock);
 }
 
 
 int evtchn_init(struct domain *d)
 {
-    spin_lock_init(&d->evtchn_lock);
+    spin_lock_init(&d->event_lock);
     if ( get_free_port(d) != 0 )
         return -EINVAL;
     evtchn_from_port(d, 0)->state = ECS_RESERVED;
@@ -957,7 +957,7 @@ void evtchn_destroy(struct domain *d)
 
     /* After this barrier no new event-channel allocations can occur. */
     BUG_ON(!d->is_dying);
-    spin_barrier(&d->evtchn_lock);
+    spin_barrier(&d->event_lock);
 
     /* Close all existing event channels. */
     for ( i = 0; port_is_valid(d, i); i++ )
@@ -967,10 +967,10 @@ void evtchn_destroy(struct domain *d)
     }
 
     /* Free all event-channel buckets. */
-    spin_lock(&d->evtchn_lock);
+    spin_lock(&d->event_lock);
     for ( i = 0; i < NR_EVTCHN_BUCKETS; i++ )
         xfree(d->evtchn[i]);
-    spin_unlock(&d->evtchn_lock);
+    spin_unlock(&d->event_lock);
 }
 
 /*
diff -Naurp xen/common/gdbstub.c xen-redhat/common/gdbstub.c
--- xen/common/gdbstub.c
+++ xen-redhat/common/gdbstub.c
@@ -478,13 +478,13 @@ process_command(struct cpu_user_regs *re
     return resume;
 }
 
-static struct gdb_context
+struct gdb_context
 __gdb_ctx = {
     .serhnd  = -1,
     .running = ATOMIC_INIT(1),
     .signum  = 1
 };
-static struct gdb_context *gdb_ctx = &__gdb_ctx;
+struct gdb_context *gdb_ctx = &__gdb_ctx;
 
 static void
 gdbstub_console_puts(const char *str)
diff -Naurp xen/common/grant_table.c xen-redhat/common/grant_table.c
--- xen/common/grant_table.c
+++ xen-redhat/common/grant_table.c
@@ -809,6 +809,7 @@ gnttab_transfer(
     grant_entry_t *sha;
     struct gnttab_transfer gop;
     unsigned long mfn;
+    unsigned int max_bitsize;
 
     for ( i = 0; i < count; i++ )
     {
@@ -857,6 +858,34 @@ gnttab_transfer(
             goto copyback;
         }
 
+        max_bitsize = domain_clamp_alloc_bitsize(
+            e, BITS_PER_LONG+PAGE_SHIFT-1);
+        if ( (1UL << (max_bitsize - PAGE_SHIFT)) <= mfn )
+        {
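+            /* Frame sits above what the destination domain can address: bounce it into a lower page. */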
+            struct page_info *new_page;
+            void *sp, *dp;
+
+            new_page = alloc_domheap_pages(NULL, 0, MEMF_bits(max_bitsize));
+            if ( new_page == NULL )
+            {
+                rcu_unlock_domain(e);
+                page->count_info &= ~(PGC_count_mask|PGC_allocated);
+                free_domheap_page(page);
+                gop.status = GNTST_address_too_big;
+                goto copyback;
+            }
+
+            sp = map_domain_page(mfn);
+            dp = map_domain_page(page_to_mfn(new_page));
+            memcpy(dp, sp, PAGE_SIZE);
+            unmap_domain_page(dp);
+            unmap_domain_page(sp);
+
+            page->count_info &= ~(PGC_count_mask|PGC_allocated);
+            free_domheap_page(page);
+            page = new_page;
+        }
+
         spin_lock(&e->page_alloc_lock);
 
         /*
@@ -896,7 +925,7 @@ gnttab_transfer(
         spin_lock(&e->grant_table->lock);
 
         sha = &shared_entry(e->grant_table, gop.ref);
-        guest_physmap_add_page(e, sha->frame, mfn);
+        guest_physmap_add_page(e, sha->frame, mfn, 0);
         sha->frame = mfn;
         wmb();
         sha->flags |= GTF_transfer_completed;
diff -Naurp xen/common/kernel.c xen-redhat/common/kernel.c
--- xen/common/kernel.c
+++ xen-redhat/common/kernel.c
@@ -80,7 +80,10 @@ void cmdline_parse(char *cmdline)
                 break;
             case OPT_BOOL:
             case OPT_INVBOOL:
-                if ( !strcmp("no", optval) || !strcmp("off", optval) )
+                if ( !strcmp("no", optval) ||
+                     !strcmp("off", optval) ||
+                     !strcmp("false", optval) ||
+                     !strcmp("0", optval) )
                     bool_assert = !bool_assert;
                 if ( param->type == OPT_INVBOOL )
                     bool_assert = !bool_assert;
@@ -217,6 +220,10 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDL
                     (1U << XENFEAT_auto_translated_physmap);
             if ( supervisor_mode_kernel )
                 fi.submap |= 1U << XENFEAT_supervisor_mode_kernel;
+#ifdef CONFIG_X86
+            if ( !is_hvm_vcpu(current) )
+                fi.submap |= 1U << XENFEAT_mmu_pt_update_preserve_ad;
+#endif
             break;
         default:
             return -EINVAL;
diff -Naurp xen/common/kexec.c xen-redhat/common/kexec.c
--- xen/common/kexec.c
+++ xen-redhat/common/kexec.c
@@ -42,6 +42,9 @@ static unsigned long kexec_flags = 0; /*
 
 static spinlock_t kexec_lock = SPIN_LOCK_UNLOCKED;
 
+static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+static size_t vmcoreinfo_size = 0;
+
 xen_kexec_reserve_t kexec_crash_area;
 
 static void __init parse_crashkernel(const char *str)
@@ -222,6 +225,13 @@ static int kexec_get(cpu)(xen_kexec_rang
     return 0;
 }
 
+static int kexec_get(vmcoreinfo)(xen_kexec_range_t *range)
+{
+    range->start = __pa((unsigned long)vmcoreinfo_data);
+    range->size = VMCOREINFO_BYTES;
+    return 0;
+}
+
 static int kexec_get(range)(XEN_GUEST_HANDLE(void) uarg)
 {
     xen_kexec_range_t range;
@@ -241,6 +251,9 @@ static int kexec_get(range)(XEN_GUEST_HA
     case KEXEC_RANGE_MA_CPU:
         ret = kexec_get(cpu)(&range);
         break;
+    case KEXEC_RANGE_MA_VMCOREINFO:
+        ret = kexec_get(vmcoreinfo)(&range);
+        break;
     }
 
     if ( ret == 0 && unlikely(copy_to_guest(uarg, &range, 1)) )
@@ -269,6 +282,56 @@ static int kexec_load_get_bits(int type,
     return 0;
 }
 
+void vmcoreinfo_append_str(const char *fmt, ...)
+{
+    va_list args;
+    char buf[0x50];
+    int r;
+    size_t note_size = sizeof(Elf_Note) + ELFNOTE_ALIGN(strlen(VMCOREINFO_NOTE_NAME) + 1);
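+    /* String data is appended after the Elf_Note header and the padded note name. */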
+
+    if (vmcoreinfo_size + note_size + sizeof(buf) > VMCOREINFO_BYTES)
+        return;
+
+    va_start(args, fmt);
+    r = vsnprintf(buf, sizeof(buf), fmt, args);
+    va_end(args);
+
+    memcpy(&vmcoreinfo_data[note_size + vmcoreinfo_size], buf, r);
+
+    vmcoreinfo_size += r;
+}
+
+static void crash_save_vmcoreinfo(void)
+{
+    size_t data_size;
+
+    if (vmcoreinfo_size > 0)    /* already saved */
+        return;
+
+    data_size = VMCOREINFO_BYTES - (sizeof(Elf_Note) + ELFNOTE_ALIGN(strlen(VMCOREINFO_NOTE_NAME) + 1));
+    setup_note((Elf_Note *)vmcoreinfo_data, VMCOREINFO_NOTE_NAME, 0, data_size);
+
+    VMCOREINFO_PAGESIZE(PAGE_SIZE);
+
+    VMCOREINFO_SYMBOL(domain_list);
+    VMCOREINFO_SYMBOL(frame_table);
+    VMCOREINFO_SYMBOL(alloc_bitmap);
+    VMCOREINFO_SYMBOL(max_page);
+    VMCOREINFO_SYMBOL(xenheap_phys_end);
+
+    VMCOREINFO_STRUCT_SIZE(page_info);
+    VMCOREINFO_STRUCT_SIZE(domain);
+
+    VMCOREINFO_OFFSET(page_info, count_info);
+    VMCOREINFO_OFFSET_ALIAS(page_info, u, _domain);
+    VMCOREINFO_OFFSET(domain, domain_id);
+    VMCOREINFO_OFFSET(domain, next_in_list);
+
+#ifdef ARCH_CRASH_SAVE_VMCOREINFO
+    arch_crash_save_vmcoreinfo();
+#endif
+}
+
 #endif
 
 static int kexec_load_unload(unsigned long op, XEN_GUEST_HANDLE(void) uarg)
@@ -307,6 +370,9 @@ static int kexec_load_unload(unsigned lo
             /* Make new image the active one */
             change_bit(bit, &kexec_flags);
         }
+#ifndef COMPAT
+        crash_save_vmcoreinfo();
+#endif
     }
 
     /* Unload the old image if present and load successful */
diff -Naurp xen/common/keyhandler.c xen-redhat/common/keyhandler.c
--- xen/common/keyhandler.c
+++ xen-redhat/common/keyhandler.c
@@ -36,10 +36,10 @@ static void keypress_softirq(void)
 {
     keyhandler_t *h;
     unsigned char key = keypress_key;
-    console_start_log_everything();
+    console_start_sync();
     if ( (h = key_table[key].u.handler) != NULL )
         (*h)(key);
-    console_end_log_everything();
+    console_end_sync();
 }
 
 void handle_keypress(unsigned char key, struct cpu_user_regs *regs)
@@ -48,10 +48,10 @@ void handle_keypress(unsigned char key, 
 
     if ( !in_irq() || (key_table[key].flags & KEYHANDLER_IRQ_CALLBACK) )
     {
-        console_start_log_everything();
+        console_start_sync();
         if ( (h = key_table[key].u.irq_handler) != NULL )
             (*h)(key, regs);
-        console_end_log_everything();
+        console_end_sync();
     }
     else
     {
@@ -205,7 +205,7 @@ static void dump_domains(unsigned char k
                    test_bit(v->virq_to_evtchn[VIRQ_DEBUG], 
                             shared_info_addr(d, evtchn_mask)),
                    test_bit(v->virq_to_evtchn[VIRQ_DEBUG] /
-                            BITS_PER_GUEST_LONG(d),
+                            BITS_PER_EVTCHN_WORD(d),
                             vcpu_info_addr(v, evtchn_pending_sel)));
             send_guest_vcpu_virq(v, VIRQ_DEBUG);
         }
diff -Naurp xen/common/memory.c xen-redhat/common/memory.c
--- xen/common/memory.c
+++ xen-redhat/common/memory.c
@@ -129,8 +129,8 @@ static void populate_physmap(struct memo
 
         if ( unlikely(paging_mode_translate(d)) )
         {
-            for ( j = 0; j < (1 << a->extent_order); j++ )
-                guest_physmap_add_page(d, gpfn + j, mfn + j);
+            if ( guest_physmap_add_page(d, gpfn, mfn, a->extent_order) )
+                    goto out;
         }
         else
         {
@@ -173,7 +173,7 @@ int guest_remove_page(struct domain *d, 
     if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
         put_page(page);
 
-    guest_physmap_remove_page(d, gmfn, mfn);
+    guest_physmap_remove_page(d, gmfn, mfn, 0);
 
     put_page(page);
 
@@ -309,18 +309,6 @@ static long memory_exchange(XEN_GUEST_HA
         goto fail_early;
     }
 
-    if ( (exch.out.address_bits != 0) &&
-         (exch.out.address_bits <
-          (get_order_from_pages(max_page) + PAGE_SHIFT)) )
-    {
-        if ( exch.out.address_bits <= PAGE_SHIFT )
-        {
-            rc = -ENOMEM;
-            goto fail_early;
-        }
-        memflags = MEMF_bits(exch.out.address_bits);
-    }
-
     if ( exch.in.extent_order <= exch.out.extent_order )
     {
         in_chunk_order  = exch.out.extent_order - exch.in.extent_order;
@@ -343,6 +331,9 @@ static long memory_exchange(XEN_GUEST_HA
     }
     d = current->domain;
 
+    memflags |= MEMF_bits(domain_clamp_alloc_bitsize(
+        d, exch.out.address_bits ? : (BITS_PER_LONG+PAGE_SHIFT)));
+
     cpu = select_local_cpu(d);
 
     for ( i = (exch.nr_exchanged >> in_chunk_order);
@@ -415,7 +406,7 @@ static long memory_exchange(XEN_GUEST_HA
             if ( !test_and_clear_bit(_PGC_allocated, &page->count_info) )
                 BUG();
             mfn = page_to_mfn(page);
-            guest_physmap_remove_page(d, mfn_to_gmfn(d, mfn), mfn);
+            guest_physmap_remove_page(d, mfn_to_gmfn(d, mfn), mfn, 0);
             put_page(page);
         }
 
@@ -436,8 +427,9 @@ static long memory_exchange(XEN_GUEST_HA
             mfn = page_to_mfn(page);
             if ( unlikely(paging_mode_translate(d)) )
             {
-                for ( k = 0; k < (1UL << exch.out.extent_order); k++ )
-                    guest_physmap_add_page(d, gpfn + k, mfn + k);
+                /* Ignore failure here. There's nothing we can do. */
+                    (void)guest_physmap_add_page(d, gpfn, mfn,
+                                        exch.out.extent_order);
             }
             else
             {
diff -Naurp xen/common/page_alloc.c xen-redhat/common/page_alloc.c
--- xen/common/page_alloc.c
+++ xen-redhat/common/page_alloc.c
@@ -54,7 +54,7 @@ boolean_param("bootscrub", opt_bootscrub
 /*
  * Bit width of the DMA heap.
  */
-static unsigned int dma_bitsize = CONFIG_DMA_BITSIZE;
+static unsigned int dma_bitsize = 0;
 static void __init parse_dma_bits(char *s)
 {
     unsigned int v = simple_strtol(s, NULL, 0);
@@ -84,16 +84,12 @@ custom_param("dma_emergency_pool", parse
 #define round_pgdown(_p)  ((_p)&PAGE_MASK)
 #define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
 
-static DEFINE_SPINLOCK(page_scrub_lock);
-LIST_HEAD(page_scrub_list);
-static unsigned long scrub_pages;
-
 /*********************
  * ALLOCATION BITMAP
  *  One bit per page of memory. Bit set => page is allocated.
  */
 
-static unsigned long *alloc_bitmap;
+unsigned long *alloc_bitmap;
 #define PAGES_PER_MAPWORD (sizeof(unsigned long) * 8)
 
 #define allocated_in_map(_pn)                       \
@@ -366,7 +362,6 @@ static struct page_info *alloc_heap_page
     struct page_info *pg;
 
     ASSERT(node >= 0);
-    ASSERT(node < num_nodes);
     ASSERT(zone_lo <= zone_hi);
     ASSERT(zone_hi < NR_ZONES);
 
@@ -395,8 +390,9 @@ static struct page_info *alloc_heap_page
         } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
 
         /* Pick next node, wrapping around if needed. */
-        if ( ++node == num_nodes )
-            node = 0;
+        node = next_node(node, node_online_map);
+        if (node == MAX_NUMNODES)
+            node = first_node(node_online_map);
     }
 
     /* No suitable memory blocks. Fail the request. */
@@ -458,7 +454,6 @@ static void free_heap_pages(
     ASSERT(zone < NR_ZONES);
     ASSERT(order <= MAX_ORDER);
     ASSERT(node >= 0);
-    ASSERT(node < num_online_nodes());
 
     for ( i = 0; i < (1 << order); i++ )
     {
@@ -571,13 +566,13 @@ void init_heap_pages(
 static unsigned long avail_heap_pages(
     unsigned int zone_lo, unsigned int zone_hi, unsigned int node)
 {
-    unsigned int i, zone, num_nodes = num_online_nodes();
+    unsigned int i, zone;
     unsigned long free_pages = 0;
 
     if ( zone_hi >= NR_ZONES )
         zone_hi = NR_ZONES - 1;
 
-    for ( i = 0; i < num_nodes; i++ )
+    for_each_online_node(i)
     {
         if ( !avail[i] )
             continue;
@@ -609,6 +604,20 @@ void __init end_boot_allocator(void)
             init_heap_pages(pfn_dom_zone_type(i), mfn_to_page(i), 1);
     }
 
+    if (dma_bitsize == 0)
+    {
+#ifdef CONFIG_X86
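+	/* Multi-node x86: size the DMA heap to cover roughly a quarter of node 0's memory, capped at 32 bits. */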
+	if (num_online_nodes() > 1)
+	    dma_bitsize = min_t(unsigned int,
+		fls(NODE_DATA(0)->node_spanned_pages) - 1 + PAGE_SHIFT - 2,
+			32);
+	else
+	    dma_bitsize = CONFIG_DMA_BITSIZE;
+#else
+	dma_bitsize = CONFIG_DMA_BITSIZE;
+#endif
+    }
+
     printk("Domain heap initialised: DMA width %u bits\n", dma_bitsize);
 }
 #undef avail_for_domheap
@@ -620,7 +629,6 @@ void __init end_boot_allocator(void)
  */
 void __init scrub_heap_pages(void)
 {
-    void *p;
     unsigned long mfn;
 
     if ( !opt_bootscrub )
@@ -644,21 +652,7 @@ void __init scrub_heap_pages(void)
 
         /* Re-check page status with lock held. */
         if ( !allocated_in_map(mfn) )
-        {
-            if ( is_xen_heap_frame(mfn_to_page(mfn)) )
-            {
-                p = page_to_virt(mfn_to_page(mfn));
-                memguard_unguard_range(p, PAGE_SIZE);
-                clear_page(p);
-                memguard_guard_range(p, PAGE_SIZE);
-            }
-            else
-            {
-                p = map_domain_page(mfn);
-                clear_page(p);
-                unmap_domain_page(p);
-            }
-        }
+            scrub_one_page(mfn_to_page(mfn));
 
         spin_unlock(&heap_lock);
     }
@@ -817,15 +811,13 @@ struct page_info *__alloc_domheap_pages(
 
     ASSERT(!in_irq());
 
-    if ( bits )
-    {
-        bits = domain_clamp_alloc_bitsize(d, bits);
-        if ( bits <= (PAGE_SHIFT + 1) )
-            return NULL;
-        bits -= PAGE_SHIFT + 1;
-        if ( bits < zone_hi )
-            zone_hi = bits;
-    }
+    bits = domain_clamp_alloc_bitsize(d, bits ? : (BITS_PER_LONG+PAGE_SHIFT));
+    if ( bits <= (PAGE_SHIFT + 1) )
+        return NULL;
+
+    bits -= PAGE_SHIFT + 1;
+    if ( bits < zone_hi )
+        zone_hi = bits;
 
     if ( (zone_hi + PAGE_SHIFT) >= dma_bitsize )
     {
@@ -897,26 +889,16 @@ void free_domheap_pages(struct page_info
 
         spin_unlock_recursive(&d->page_alloc_lock);
 
-        if ( likely(!d->is_dying) )
-        {
-            free_heap_pages(pfn_dom_zone_type(page_to_mfn(pg)), pg, order);
-        }
-        else
-        {
-            /*
-             * Normally we expect a domain to clear pages before freeing them,
-             * if it cares about the secrecy of their contents. However, after
-             * a domain has died we assume responsibility for erasure.
-             */
+        /*
+         * Normally we expect a domain to clear pages before freeing them,
+         * if it cares about the secrecy of their contents. However, after
+         * a domain has died we assume responsibility for erasure.
+         */
+        if ( unlikely(d->is_dying) )
             for ( i = 0; i < (1 << order); i++ )
-            {
-                page_set_owner(&pg[i], NULL);
-                spin_lock(&page_scrub_lock);
-                list_add(&pg[i].list, &page_scrub_list);
-                scrub_pages++;
-                spin_unlock(&page_scrub_lock);
-            }
-        }
+                scrub_one_page(&pg[i]);
+
+        free_heap_pages(pfn_dom_zone_type(page_to_mfn(pg)), pg, order);
     }
     else
     {
@@ -929,6 +911,23 @@ void free_domheap_pages(struct page_info
         put_domain(d);
 }
 
+unsigned long avail_domheap_pages_region(
+    unsigned int node, unsigned int min_width, unsigned int max_width)
+{
+    int zone_lo, zone_hi;
+
+    zone_lo = min_width ? (min_width - (PAGE_SHIFT + 1)) : (MEMZONE_XEN + 1);
+    zone_lo = max_t(int, MEMZONE_XEN + 1, zone_lo);
+    zone_lo = min_t(int, NR_ZONES - 1, zone_lo);
+
+    zone_hi = max_width ? (max_width - (PAGE_SHIFT + 1)) : (NR_ZONES - 1);
+    zone_hi = max_t(int, MEMZONE_XEN + 1, zone_hi);
+    zone_hi = min_t(int, NR_ZONES - 1, zone_hi);
+
+    return avail_heap_pages(zone_lo, zone_hi, node);
+}
+
+
 
 unsigned long avail_domheap_pages(void)
 {
@@ -950,11 +949,6 @@ unsigned long avail_domheap_pages(void)
     return avail_nrm + avail_dma;
 }
 
-unsigned long avail_nodeheap_pages(int node)
-{
-    return avail_heap_pages(0, NR_ZONES - 1, node);
-}
-
 static void pagealloc_keyhandler(unsigned char key)
 {
     unsigned int zone = MEMZONE_XEN;
@@ -992,70 +986,19 @@ static __init int pagealloc_keyhandler_i
 }
 __initcall(pagealloc_keyhandler_init);
 
-
-
-/*************************
- * PAGE SCRUBBING
- */
-
-static DEFINE_PER_CPU(struct timer, page_scrub_timer);
-
-static void page_scrub_softirq(void)
+void scrub_one_page(struct page_info *pg)
 {
-    struct list_head *ent;
-    struct page_info  *pg;
-    void             *p;
-    int               i;
-    s_time_t          start = NOW();
-
-    /* Aim to do 1ms of work every 10ms. */
-    do {
-        spin_lock(&page_scrub_lock);
-
-        if ( unlikely((ent = page_scrub_list.next) == &page_scrub_list) )
-        {
-            spin_unlock(&page_scrub_lock);
-            return;
-        }
-        
-        /* Peel up to 16 pages from the list. */
-        for ( i = 0; i < 16; i++ )
-        {
-            if ( ent->next == &page_scrub_list )
-                break;
-            ent = ent->next;
-        }
-        
-        /* Remove peeled pages from the list. */
-        ent->next->prev = &page_scrub_list;
-        page_scrub_list.next = ent->next;
-        scrub_pages -= (i+1);
-
-        spin_unlock(&page_scrub_lock);
+    void *p = map_domain_page(page_to_mfn(pg));
 
-        /* Working backwards, scrub each page in turn. */
-        while ( ent != &page_scrub_list )
-        {
-            pg = list_entry(ent, struct page_info, list);
-            ent = ent->prev;
-            p = map_domain_page(page_to_mfn(pg));
-            clear_page(p);
-            unmap_domain_page(p);
-            free_heap_pages(pfn_dom_zone_type(page_to_mfn(pg)), pg, 0);
-        }
-    } while ( (NOW() - start) < MILLISECS(1) );
-
-    set_timer(&this_cpu(page_scrub_timer), NOW() + MILLISECS(10));
-}
-
-static void page_scrub_timer_fn(void *unused)
-{
-    page_scrub_schedule_work();
-}
+#ifndef NDEBUG
+    /* Avoid callers relying on allocations returning zeroed pages. */
+    memset(p, 0xc2, PAGE_SIZE);
+#else
+    /* For a production build, clear_page() is the fastest way to scrub. */
+    clear_page(p);
+#endif
 
-unsigned long avail_scrub_pages(void)
-{
-    return scrub_pages;
+    unmap_domain_page(p);
 }
 
 static void dump_heap(unsigned char key)
@@ -1083,18 +1026,6 @@ static __init int register_heap_trigger(
 }
 __initcall(register_heap_trigger);
 
-
-static __init int page_scrub_init(void)
-{
-    int cpu;
-    for_each_cpu ( cpu )
-        init_timer(&per_cpu(page_scrub_timer, cpu),
-                   page_scrub_timer_fn, NULL, cpu);
-    open_softirq(PAGE_SCRUB_SOFTIRQ, page_scrub_softirq);
-    return 0;
-}
-__initcall(page_scrub_init);
-
 /*
  * Local variables:
  * mode: C
diff -Naurp xen/common/sched_credit.c xen-redhat/common/sched_credit.c
--- xen/common/sched_credit.c
+++ xen-redhat/common/sched_credit.c
@@ -6,7 +6,8 @@
  *      Author: Emmanuel Ackaouy
  *
  * Description: Credit-based SMP CPU scheduler
- */
+ *
+*/
 
 #include <xen/config.h>
 #include <xen/init.h>
@@ -48,14 +49,24 @@
 #define CSCHED_CREDITS_PER_ACCT     \
     (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_ACCT)
 
+/* opt_hardvirt: This enables both the dom0 bypass and the
+ * hard-virt dom0.  By default these are disabled so as to
+ * keep behavior as expected for workloads running on an
+ * existing dom0.
+ */
+static int opt_hardvirt = 0;
+boolean_param("hardvirt", opt_hardvirt);
+
 
 /*
  * Priorities
  */
-#define CSCHED_PRI_TS_BOOST      0      /* time-share waking up */
 #define CSCHED_PRI_TS_UNDER     -1      /* time-share w/ credits */
 #define CSCHED_PRI_TS_OVER      -2      /* time-share w/o credits */
 #define CSCHED_PRI_IDLE         -64     /* idle */
+#define CSCHED_PRI_RR           10      /* Dom-0 and Hard-Virts - HV */
+
+#define NUMBER_DOM0_VCPUS_PRESENT(_cpu) (CSCHED_PCPU(_cpu)->number_of_dom0_vcpus_present)
 
 
 /*
@@ -123,7 +134,12 @@
     _MACRO(dom_init)                        \
     _MACRO(dom_destroy)                     \
     _MACRO(vcpu_init)                       \
-    _MACRO(vcpu_destroy)
+    _MACRO(vcpu_destroy)                    \
+    _MACRO(tickle_hard_virt_none)                       \
+    _MACRO(rt_imbalance)                                \
+    _MACRO(rt_vcpu_migrate)                             \
+    _MACRO(rt_steal_trylock_failed)
+
 
 #ifndef NDEBUG
 #define CSCHED_STATS_EXPAND_CHECKS(_MACRO)  \
@@ -188,6 +204,8 @@ struct csched_pcpu {
     uint32_t runq_sort_last;
     struct timer ticker;
     unsigned int tick;
+    uint16_t number_of_dom0_vcpus_present;
+    uint16_t unused; /* HV */
 };
 
 /*
@@ -201,6 +219,11 @@ struct csched_vcpu {
     atomic_t credit;
     uint16_t flags;
     int16_t pri;
+    int credit_real_incr;
+    atomic_t hard_virt_pcpu;  /* HV */
+    uint16_t hard_virt_pcpu_state_change; /* HV */
+    uint16_t unused;
+
 #ifdef CSCHED_STATS
     struct {
         int credit_last;
@@ -239,6 +262,9 @@ struct csched_private {
     int credit_balance;
     uint32_t runq_sort;
     CSCHED_STATS_DEFINE()
+    spinlock_t hard_virt_lock; /* HV */
+    cpumask_t hard_virt_none; /* bit set (default): that pcpu has no RT vcpu */
+    cpumask_t hard_virt_multiple; /* bit set: that pcpu has more than one RT vcpu; clear by default */
 };
 
 
@@ -249,6 +275,9 @@ static struct csched_private csched_priv
 
 static void csched_tick(void *_cpu);
 
+/* HV - Protected by hard_virt_lock */
+static unsigned int total_hard_virts=0;
+
 static inline int
 __cycle_cpu(int cpu, const cpumask_t *mask)
 {
@@ -275,14 +304,92 @@ __runq_insert(unsigned int cpu, struct c
 {
     const struct list_head * const runq = RUNQ(cpu);
     struct list_head *iter;
+    int credit, new_credit;
+
+
+    BUG_ON( __vcpu_on_runq(svc) );
+    BUG_ON( cpu != svc->vcpu->processor );
+
+    /* HV - No race condition for hard_virt_pcpu_state_change here */
+    if (svc->hard_virt_pcpu_state_change)
+    {
+       svc->hard_virt_pcpu_state_change = 0;
+       if (atomic_read(&svc->hard_virt_pcpu))
+          svc->pri = CSCHED_PRI_RR;
+       else if (svc->pri == CSCHED_PRI_RR && svc->vcpu->domain->domain_id != 0)
+       {
+           if (atomic_read(&svc->credit) > 0)
+               svc->pri = CSCHED_PRI_TS_UNDER;
+           else
+               svc->pri = CSCHED_PRI_TS_OVER;
+       }
+    }
+    if (svc->vcpu->domain->domain_id == 0)
+       NUMBER_DOM0_VCPUS_PRESENT(cpu)++;
+
+    new_credit = atomic_read(&svc->credit);
+
+    if (new_credit >= CSCHED_CREDITS_PER_TSLICE/2)
+    {
+        list_for_each( iter, runq )
+        {
+            const struct csched_vcpu * const iter_svc = __runq_elem(iter);
+            if (svc->pri > iter_svc->pri )
+                break;
+            credit = atomic_read(&iter_svc->credit);
+            if ( svc->pri == iter_svc->pri && credit < (CSCHED_CREDITS_PER_TSLICE/2) )
+                break;
+        }
+    }
+    else
+    {
+         list_for_each( iter, runq )
+        {
+            const struct csched_vcpu * const iter_svc = __runq_elem(iter);
+            if ( svc->pri > iter_svc->pri )
+                break;
+        }
+    }
+
+    list_add_tail(&svc->runq_elem, iter);
+}
+
+static inline void
+__runq_insert_special(unsigned int cpu, struct csched_vcpu *svc)
+{
+    const struct list_head * const runq = RUNQ(cpu);
+    struct list_head *iter;
+    int new_credit, credit;
 
     BUG_ON( __vcpu_on_runq(svc) );
     BUG_ON( cpu != svc->vcpu->processor );
 
+    /* HV */
+    if (svc->hard_virt_pcpu_state_change)
+    {
+       svc->hard_virt_pcpu_state_change = 0;
+       if (atomic_read(&svc->hard_virt_pcpu))
+          svc->pri = CSCHED_PRI_RR;
+       else if (svc->pri == CSCHED_PRI_RR && svc->vcpu->domain->domain_id != 0)
+       {
+           if (atomic_read(&svc->credit) > 0)
+               svc->pri = CSCHED_PRI_TS_UNDER;
+           else
+               svc->pri = CSCHED_PRI_TS_OVER;
+       }
+    }
+    if (svc->vcpu->domain->domain_id == 0)
+       NUMBER_DOM0_VCPUS_PRESENT(cpu)++;
+
+    new_credit = atomic_read(&svc->credit);
+
     list_for_each( iter, runq )
     {
         const struct csched_vcpu * const iter_svc = __runq_elem(iter);
         if ( svc->pri > iter_svc->pri )
+           break;
+        credit = atomic_read(&iter_svc->credit);
+        if ( (svc->pri == iter_svc->pri && new_credit >= credit))
             break;
     }
 
@@ -294,6 +401,24 @@ __runq_remove(struct csched_vcpu *svc)
 {
     BUG_ON( !__vcpu_on_runq(svc) );
     list_del_init(&svc->runq_elem);
+
+    /* HV */
+    if (svc->vcpu->domain->domain_id == 0)
+        NUMBER_DOM0_VCPUS_PRESENT(svc->vcpu->processor)--;
+
+    if (svc->hard_virt_pcpu_state_change)
+    {
+       svc->hard_virt_pcpu_state_change = 0;
+       if (atomic_read(&svc->hard_virt_pcpu))
+          svc->pri = CSCHED_PRI_RR;
+       else if (svc->pri == CSCHED_PRI_RR && svc->vcpu->domain->domain_id != 0 )
+       {
+           if (atomic_read(&svc->credit) > 0)
+               svc->pri = CSCHED_PRI_TS_UNDER;
+           else
+               svc->pri = CSCHED_PRI_TS_OVER;
+       }
+    }
 }
 
 static inline void
@@ -302,12 +427,18 @@ __runq_tickle(unsigned int cpu, struct c
     struct csched_vcpu * const cur =
         CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
     cpumask_t mask;
+    int newcredit, curcredit;
 
     ASSERT(cur);
     cpus_clear(mask);
 
     /* If strictly higher priority than current VCPU, signal the CPU */
-    if ( new->pri > cur->pri )
+    newcredit = atomic_read(&new->credit);
+    curcredit = atomic_read(&cur->credit);
+    /* HV */
+    if ((opt_hardvirt && new->vcpu->domain->domain_id == 0) ||
+	(new->pri > cur->pri) ||
+	(new->pri == cur->pri && newcredit > curcredit && newcredit > -(CSCHED_CREDITS_PER_TSLICE>>3)) )
     {
         if ( cur->pri == CSCHED_PRI_IDLE )
             CSCHED_STAT_CRANK(tickle_local_idler);
@@ -339,6 +470,18 @@ __runq_tickle(unsigned int cpu, struct c
         }
     }
 
+    /* HV - Small chance of false positive in hard_virt_none map here */
+    if ( cur->pri == CSCHED_PRI_RR && new->pri == CSCHED_PRI_RR )
+    {
+       cpu_set(cpu, csched_priv.hard_virt_multiple);
+       if ( ! cpus_empty(csched_priv.hard_virt_none) )
+       {
+           CSCHED_STAT_CRANK(tickle_hard_virt_none);
+           cpus_or(mask, mask, csched_priv.hard_virt_none);
+           cpus_and(mask, mask, new->vcpu->cpu_affinity);
+       }
+    }
+
     /* Send scheduler interrupts to designated CPUs */
     if ( !cpus_empty(mask) )
         cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
@@ -367,11 +510,14 @@ csched_pcpu_init(int cpu)
     init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu);
     INIT_LIST_HEAD(&spc->runq);
     spc->runq_sort_last = csched_priv.runq_sort;
+    spc->number_of_dom0_vcpus_present = 0;
+    spc->unused = 0; /* HV */
     per_cpu(schedule_data, cpu).sched_priv = spc;
 
     /* Start off idling... */
     BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
     cpu_set(cpu, csched_priv.idlers);
+    cpu_set(cpu, csched_priv.hard_virt_none); /* HV */
 
     spin_unlock_irqrestore(&csched_priv.lock, flags);
 
@@ -464,6 +610,20 @@ csched_cpu_pick(struct vcpu *vc)
         }
         else
         {
+            /* This is of questionable value: there are many cases
+             * where VCPUs are better off on the same socket because
+             * of effective L2 sharing and the low impact of cache
+             * bouncing.
+             * In the absence of any other workload, moving the VCPUs
+             * to different cores helps transiently, but once the
+             * system gets busy there is no mechanism to assert
+             * socket-level affinities, so performance takes a hit
+             * and NUMA placement is lost as well.
+             *
+             * Eventually we would want to allocate memory for Virts
+             * from local NUMA nodes, in which case NUMA affinities
+             * need to be implemented by the scheduler and this
+             * section needs to be thrown out.  */
             ASSERT( !cpu_isset(nxt, cpu_core_map[cpu]) );
             cpus_and(cpu_idlers, idlers, cpu_core_map[cpu]);
             cpus_and(nxt_idlers, idlers, cpu_core_map[nxt]);
@@ -533,22 +693,23 @@ csched_vcpu_acct(unsigned int cpu)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(current);
 
+    int credit;
+    /* Update credits */
+    credit = atomic_read(&svc->credit);
+
     ASSERT( current->processor == cpu );
     ASSERT( svc->sdom != NULL );
 
     /*
-     * If this VCPU's priority was boosted when it last awoke, reset it.
-     * If the VCPU is found here, then it's consuming a non-negligeable
-     * amount of CPU resources and should no longer be boosted.
-     */
-    if ( svc->pri == CSCHED_PRI_TS_BOOST )
-        svc->pri = CSCHED_PRI_TS_UNDER;
-
-    /*
      * Update credits
      */
     atomic_sub(CSCHED_CREDITS_PER_TICK, &svc->credit);
 
+    if ( credit < CSCHED_CREDITS_PER_TICK && svc->pri == CSCHED_PRI_TS_UNDER )
+    {
+         svc->pri = CSCHED_PRI_TS_OVER;
+    }
+
     /*
      * Put this VCPU and domain back on the active list if it was
      * idling.
@@ -594,6 +755,14 @@ csched_vcpu_init(struct vcpu *vc)
     CSCHED_VCPU_STATS_RESET(svc);
     vc->sched_priv = svc;
 
+    /* HV */
+    if (opt_hardvirt && vc->domain->domain_id == 0 && !is_idle_vcpu(vc))
+	svc->pri = CSCHED_PRI_RR;
+    svc->credit_real_incr = 0;
+    atomic_set(&svc->hard_virt_pcpu, 0); /* HV */
+    svc->hard_virt_pcpu_state_change = 0;
+    svc->unused = 0;
+
     /* Allocate per-PCPU info */
     if ( unlikely(!CSCHED_PCPU(vc->processor)) )
     {
@@ -617,6 +786,16 @@ csched_vcpu_destroy(struct vcpu *vc)
     BUG_ON( sdom == NULL );
     BUG_ON( !list_empty(&svc->runq_elem) );
 
+    /* HV */
+    spin_lock(&csched_priv.hard_virt_lock);
+    if (atomic_read(&svc->hard_virt_pcpu))
+    {
+         atomic_set(&svc->hard_virt_pcpu, 0);
+         svc->hard_virt_pcpu_state_change=1;
+         total_hard_virts--;
+    }
+    spin_unlock(&csched_priv.hard_virt_lock);
+
     spin_lock_irqsave(&csched_priv.lock, flags);
 
     if ( !list_empty(&svc->active_vcpu_elem) )
@@ -666,37 +845,32 @@ csched_vcpu_wake(struct vcpu *vc)
     else
         CSCHED_STAT_CRANK(vcpu_wake_not_runnable);
 
-    /*
-     * We temporarly boost the priority of awaking VCPUs!
-     *
-     * If this VCPU consumes a non negligeable amount of CPU, it
-     * will eventually find itself in the credit accounting code
-     * path where its priority will be reset to normal.
-     *
-     * If on the other hand the VCPU consumes little CPU and is
-     * blocking and awoken a lot (doing I/O for example), its
-     * priority will remain boosted, optimizing it's wake-to-run
-     * latencies.
-     *
-     * This allows wake-to-run latency sensitive VCPUs to preempt
-     * more CPU resource intensive VCPUs without impacting overall 
-     * system fairness.
-     *
-     * The one exception is for VCPUs of capped domains unpausing
-     * after earning credits they had overspent. We don't boost
-     * those.
-     */
-    if ( svc->pri == CSCHED_PRI_TS_UNDER &&
-         !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
-    {
-        svc->pri = CSCHED_PRI_TS_BOOST;
-    }
-
     /* Put the VCPU on the runq and tickle CPUs */
-    __runq_insert(cpu, svc);
+    __runq_insert_special(cpu, svc);
     __runq_tickle(cpu, svc);
 }
 
+/* HV - Count up all vcpus including offline ones */
+static unsigned int find_vcpu_count(struct domain *d)
+{
+    struct vcpu *v;
+    unsigned int vcpu_count=0;
+    for_each_vcpu(d, v)
+        vcpu_count++;
+    return vcpu_count;
+}
+
+/* HV - Only online pcpus are considered as valid HV target */
+static unsigned int find_available_online_cpus(unsigned int max_cpus)
+{
+    int cpu;
+    unsigned int pcpu_count=0;
+
+    for_each_online_cpu ( cpu )
+       pcpu_count++;
+    return pcpu_count - total_hard_virts;
+}
+
 static int
 csched_dom_cntl(
     struct domain *d,
@@ -705,15 +879,96 @@ csched_dom_cntl(
     struct csched_dom * const sdom = CSCHED_DOM(d);
     unsigned long flags;
 
+    /* HV */
+    unsigned short hard_virt, vcpu;
+    unsigned int vcpus_in_domain, hard_cpus_available;
+
     if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
     {
-        op->u.credit.weight = sdom->weight;
+        /* HV */
+        op->u.credit.weight = sdom->weight + (atomic_read(&d->hard_virt) << 15) ;
         op->u.credit.cap = sdom->cap;
     }
     else
     {
         ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);
 
+        /* HV */
+        hard_virt = (op->u.credit.weight >> 15) & 0x1;
+        op->u.credit.weight &= 0x7fff;
+
+        if (hard_virt != atomic_read(&d->hard_virt))
+        {
+           if (!hard_virt)
+           {
+               /* This will convert a hard-virt to virt - This really shouldn't fail */
+               printk("Taking down hard-virt %u\n", d->domain_id);
+               spin_lock(&csched_priv.hard_virt_lock);
+               for (vcpu=0; vcpu < MAX_VIRT_CPUS; vcpu++)
+               {
+                  if (d->vcpu[vcpu] == NULL)
+                     break;
+                  if ( atomic_read( &(CSCHED_VCPU(d->vcpu[vcpu])->hard_virt_pcpu) ) )
+                  {
+                         atomic_set(&(CSCHED_VCPU(d->vcpu[vcpu])->hard_virt_pcpu), 0);
+                         CSCHED_VCPU(d->vcpu[vcpu])->hard_virt_pcpu_state_change = 1;
+                  }
+                  total_hard_virts--;
+               }
+               atomic_set(&d->hard_virt, 0);
+               spin_unlock(&csched_priv.hard_virt_lock);
+               if (total_hard_virts < 0){
+                  printk("total_hard_virts less than 0!!\n");
+                  total_hard_virts = 0;
+               }
+           }
+           else
+           {
+               /* This will convert the virt into a hard-virt - If this fails,
+                * the entire operation fails
+                */
+               /* Hard Virt conversion is made atomic with respect to hardvirt
+                * destruction code path using a spinlock
+                */
+               printk("Creating Hard-Virt %u\n", d->domain_id);
+               if (sdom->cap != 0U)
+               {
+                   return -0xDEAD;
+               }
+               if (d->domain_id == 0)
+               {
+                   return -0xDEAD;
+               }
+
+               spin_lock(&csched_priv.hard_virt_lock);
+               vcpus_in_domain = find_vcpu_count(d);
+               hard_cpus_available = find_available_online_cpus(vcpus_in_domain);
+               printk("to convert %d - available %d \n", vcpus_in_domain, hard_cpus_available);
+               if (vcpus_in_domain > hard_cpus_available)
+               {
+                   spin_unlock(&csched_priv.hard_virt_lock);
+                   return -0xDEAD;
+               }
+               atomic_set(&d->hard_virt, 1);
+               for (vcpu=0; vcpu < MAX_VIRT_CPUS; vcpu++)
+               {
+                  if (d->vcpu[vcpu] == NULL)
+                     break;
+                  if ( atomic_read(&(CSCHED_VCPU(d->vcpu[vcpu])->hard_virt_pcpu)) )
+                  {
+                     spin_unlock(&csched_priv.hard_virt_lock);
+                     printk("Vcpu %d already has a pcpu assigned - Aborting half way through.. \n", vcpu);
+                     atomic_set(&d->hard_virt, 0);
+                     return -0xDEAD;
+                  }
+                  atomic_set(&(CSCHED_VCPU(d->vcpu[vcpu])->hard_virt_pcpu), 1);
+                  CSCHED_VCPU(d->vcpu[vcpu])->hard_virt_pcpu_state_change = 1;
+                  total_hard_virts++;
+               }
+               spin_unlock(&csched_priv.hard_virt_lock);
+           }
+        }
+
         spin_lock_irqsave(&csched_priv.lock, flags);
 
         if ( op->u.credit.weight != 0 )
@@ -726,7 +981,7 @@ csched_dom_cntl(
             sdom->weight = op->u.credit.weight;
         }
 
-        if ( op->u.credit.cap != (uint16_t)~0U )
+        if ( op->u.credit.cap != (uint16_t)~0U && !atomic_read(&d->hard_virt) )
             sdom->cap = op->u.credit.cap;
 
         spin_unlock_irqrestore(&csched_priv.lock, flags);
@@ -783,6 +1038,7 @@ csched_runq_sort(unsigned int cpu)
     struct csched_vcpu *svc_elem;
     unsigned long flags;
     int sort_epoch;
+    int credit;
 
     sort_epoch = csched_priv.runq_sort;
     if ( sort_epoch == spc->runq_sort_last )
@@ -801,7 +1057,32 @@ csched_runq_sort(unsigned int cpu)
         next = elem->next;
         svc_elem = __runq_elem(elem);
 
-        if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER )
+        if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER || svc_elem->pri == CSCHED_PRI_RR )
+        {
+            /* does elem need to move up the runq? */
+            if ( elem->prev != last_under )
+            {
+                list_del(elem);
+                list_add(elem, last_under);
+            }
+            last_under = elem;
+        }
+
+        elem = next;
+    }
+
+    elem = runq->next;
+    last_under = runq;
+
+    while ( elem != runq )
+    {
+        next = elem->next;
+        svc_elem = __runq_elem(elem);
+        if (svc_elem->pri != CSCHED_PRI_TS_UNDER && svc_elem->pri != CSCHED_PRI_RR)
+            break;
+        credit = atomic_read (&svc_elem->credit);
+
+        if ( credit >= CSCHED_CREDITS_PER_TSLICE/2 )
         {
             /* does elem need to move up the runq? */
             if ( elem->prev != last_under )
@@ -814,6 +1095,31 @@ csched_runq_sort(unsigned int cpu)
 
         elem = next;
     }
+    /* HV - TODO: three passes over the runq is wasteful; an old-fashioned
+     * bubble sort is likely to be no worse in most cases - consider a rewrite */
+    elem = runq->next;
+    last_under = runq;
+
+    while ( elem != runq )
+    {
+        next = elem->next;
+        svc_elem = __runq_elem(elem);
+        if (svc_elem->pri != CSCHED_PRI_TS_UNDER && svc_elem->pri != CSCHED_PRI_RR)
+            break;
+        if ( svc_elem->pri == CSCHED_PRI_RR )
+        {
+            /* does elem need to move up the runq? */
+            if ( elem->prev != last_under )
+            {
+                list_del(elem);
+                list_add(elem, last_under);
+            }
+            last_under = elem;
+        }
+
+        elem = next;
+    }
+
 
     spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags);
 }
@@ -835,6 +1141,8 @@ csched_acct(void)
     int credit_balance;
     int credit_xtra;
     int credit;
+    uint32_t max_credit;
+    int credit_prev, credit_real_incr;
 
 
     spin_lock_irqsave(&csched_priv.lock, flags);
@@ -945,8 +1253,34 @@ csched_acct(void)
             BUG_ON( sdom != svc->sdom );
 
             /* Increment credit */
-            atomic_add(credit_fair, &svc->credit);
             credit = atomic_read(&svc->credit);
+            credit_prev = credit;
+            credit_real_incr = svc->credit_real_incr;
+
+            if (credit <= 0)
+                credit += credit_fair;
+            else
+            {
+                if ( sdom->cap != 0U )
+                {
+                   if (!vcpu_runnable(svc->vcpu))
+                   {
+                        credit = credit/2;
+                        if (credit > credit_fair/2)
+                             credit = credit_fair/2;
+                   }
+                }
+                /* If this earned fair share of credits last time
+                     then allow rollover credits */
+                if ( credit_real_incr > credit_fair )
+                {
+                      credit -= credit_real_incr - credit_fair;
+                      if (credit < 0)
+                           credit = 0;
+                }
+                credit += credit_fair;
+            }
+            atomic_set(&svc->credit, credit);
 
             /*
              * Recompute priority or, if VCPU is idling, remove it from
@@ -954,29 +1288,33 @@ csched_acct(void)
              */
             if ( credit < 0 )
             {
-                svc->pri = CSCHED_PRI_TS_OVER;
-
-                /* Park running VCPUs of capped-out domains */
-                if ( sdom->cap != 0U &&
-                     credit < -credit_cap &&
-                     !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
+                if (svc->pri != CSCHED_PRI_RR)
                 {
-                    CSCHED_STAT_CRANK(vcpu_park);
-                    vcpu_pause_nosync(svc->vcpu);
-                    svc->flags |= CSCHED_FLAG_VCPU_PARKED;
-                }
+                    svc->pri = CSCHED_PRI_TS_OVER;
+
+                    /* Park running VCPUs of capped-out domains */
+                    if ( sdom->cap != 0U &&
+                         credit < -credit_cap &&
+                         !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
+                    {
+                        CSCHED_STAT_CRANK(vcpu_park);
+                        vcpu_pause_nosync(svc->vcpu);
+                        svc->flags |= CSCHED_FLAG_VCPU_PARKED;
+                    }
+                }
 
                 /* Lower bound on credits */
-                if ( credit < -CSCHED_CREDITS_PER_TSLICE )
+                if ( credit < -(CSCHED_CREDITS_PER_TSLICE<<1) )
                 {
                     CSCHED_STAT_CRANK(acct_min_credit);
-                    credit = -CSCHED_CREDITS_PER_TSLICE;
+                    credit = -(CSCHED_CREDITS_PER_TSLICE<<1);
                     atomic_set(&svc->credit, credit);
                 }
-            }
+            }
             else
             {
-                svc->pri = CSCHED_PRI_TS_UNDER;
+                if (svc->pri != CSCHED_PRI_RR)
+                    svc->pri = CSCHED_PRI_TS_UNDER;
 
                 /* Unpark any capped domains whose credits go positive */
                 if ( svc->flags & CSCHED_FLAG_VCPU_PARKED)
@@ -992,17 +1330,25 @@ csched_acct(void)
                 }
 
                 /* Upper bound on credits means VCPU stops earning */
-                if ( credit > CSCHED_CREDITS_PER_TSLICE )
-                {
+                max_credit = (credit_fair << 1) + credit_fair;
+                if (max_credit > 3*CSCHED_CREDITS_PER_TSLICE/2)
+                    max_credit = 3*CSCHED_CREDITS_PER_TSLICE/2;
+                else if (max_credit < CSCHED_CREDITS_PER_TSLICE/2)
+                    max_credit = CSCHED_CREDITS_PER_TSLICE/2;
+                if ( credit > max_credit ){
+                    credit = max_credit;
                     __csched_vcpu_acct_stop_locked(svc);
-                    credit = 0;
                     atomic_set(&svc->credit, credit);
                 }
+
             }
 
             CSCHED_VCPU_STAT_SET(svc, credit_last, credit);
             CSCHED_VCPU_STAT_SET(svc, credit_incr, credit_fair);
+            svc->credit_real_incr = credit - credit_prev;
             credit_balance += credit;
+            if (credit_fair > svc->credit_real_incr)
+                credit_total += credit_fair - svc->credit_real_incr;
         }
     }
 
@@ -1048,18 +1394,21 @@ csched_tick(void *_cpu)
      * once per accounting period (currently 30 milliseconds).
      */
     csched_runq_sort(cpu);
+    cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
 
     set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
 }
 
 static struct csched_vcpu *
-csched_runq_steal(int peer_cpu, int cpu, int pri)
+csched_runq_steal(int peer_cpu, int cpu, int pri, int credit)
 {
     const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu);
     const struct vcpu * const peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
     struct csched_vcpu *speer;
     struct list_head *iter;
     struct vcpu *vc;
+    int speer_credit;
+
 
     /*
      * Don't steal from an idle CPU's runq because it's about to
@@ -1075,8 +1424,10 @@ csched_runq_steal(int peer_cpu, int cpu,
              * If next available VCPU here is not of strictly higher
              * priority than ours, this PCPU is useless to us.
              */
-            if ( speer->pri <= pri )
-                break;
+            speer_credit = atomic_read(&speer->credit);
+            if ( speer->pri <= CSCHED_PRI_IDLE || speer->pri < pri
+                || (speer->pri == pri && speer_credit <= (credit+(CSCHED_CREDITS_PER_TSLICE>>3)) ) )
+                 break;
 
             /* Is this VCPU is runnable on our PCPU? */
             vc = speer->vcpu;
@@ -1099,11 +1450,12 @@ csched_runq_steal(int peer_cpu, int cpu,
 }
 
 static struct csched_vcpu *
-csched_load_balance(int cpu, struct csched_vcpu *snext)
+csched_load_balance(int cpu, struct csched_vcpu *snext, int credit)
 {
     struct csched_vcpu *speer;
     cpumask_t workers;
     int peer_cpu;
+    int repeat_count = 15, lock_failure_flag = 0;
 
     BUG_ON( cpu != snext->vcpu->processor );
 
@@ -1114,6 +1466,7 @@ csched_load_balance(int cpu, struct csch
     else
         CSCHED_STAT_CRANK(load_balance_other);
 
+  spinLockRetry:
     /*
      * Peek at non-idling CPUs in the system, starting with our
      * immediate neighbour.
@@ -1137,23 +1490,154 @@ csched_load_balance(int cpu, struct csch
         if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
         {
             CSCHED_STAT_CRANK(steal_trylock_failed);
+            lock_failure_flag = 1;
             continue;
         }
 
         /*
          * Any work over there to steal?
          */
-        speer = csched_runq_steal(peer_cpu, cpu, snext->pri);
+        speer = csched_runq_steal(peer_cpu, cpu, snext->pri, credit);
         spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
         if ( speer != NULL )
             return speer;
     }
 
+    if ( opt_hardvirt && lock_failure_flag && snext->pri == CSCHED_PRI_IDLE && repeat_count > 1 )
+    {
+        lock_failure_flag = 0;
+        repeat_count--;
+        goto spinLockRetry;
+     }
+
     /* Failed to find more important work elsewhere... */
     __runq_remove(snext);
     return snext;
 }
 
+static struct csched_vcpu *
+csched_runq_rr_steal(int peer_cpu, int cpu)
+{
+    const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu);
+    const struct vcpu * const peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
+    struct csched_vcpu *speer;
+    struct list_head *iter;
+    struct vcpu *vc;
+
+
+    /*
+     * Don't steal from an idle CPU's runq because it's about to
+     * pick up work from it itself.
+     */
+    if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) )
+    {
+        list_for_each( iter, &peer_pcpu->runq )
+        {
+            speer = __runq_elem(iter);
+
+            /** If next available VCPU here is not of strictly higher
+             * priority than ours, this PCPU is useless to us.
+             */
+            if ( speer->pri < CSCHED_PRI_RR )
+                 break;
+
+            /* Is this VCPU is runnable on our PCPU? */
+            vc = speer->vcpu;
+            BUG_ON( is_idle_vcpu(vc) );
+
+            if (__csched_vcpu_is_migrateable(vc, cpu))
+            {
+                /* We got a candidate. Grab it! */
+                CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
+                CSCHED_STAT_CRANK(migrate_queued);
+                __runq_remove(speer);
+                vc->processor = cpu;
+                return speer;
+            }
+        }
+    }
+
+    return NULL;
+}
+
+static struct csched_vcpu *
+csched_rr_load_balance(int cpu, struct csched_vcpu *snext)
+{
+    struct csched_vcpu *speer;
+    cpumask_t workers;
+    int peer_cpu;
+    int repeat_count = 15, lock_failure_flag = 0;
+
+    BUG_ON( cpu != snext->vcpu->processor );
+
+  spinLockRetry:
+
+    cpus_and(workers, cpu_online_map, cpu_online_map);
+    cpu_clear(cpu, workers);
+    peer_cpu = cpu;
+
+    while ( !cpus_empty(workers) )
+    {
+        peer_cpu = __cycle_cpu(peer_cpu, &workers);
+        cpu_clear(peer_cpu, workers);
+
+        /*
+         * Get ahold of the scheduler lock for this peer CPU.
+         *
+         * Note: We don't spin on this lock but simply try it. Spinning could
+         * cause a deadlock if the peer CPU is also load balancing and trying
+         * to lock this CPU.
+         */
+        if ( !cpu_isset(peer_cpu, csched_priv.hard_virt_multiple))
+            continue;
+
+        if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
+        {
+            CSCHED_STAT_CRANK(rt_steal_trylock_failed);
+            lock_failure_flag = 1;
+            continue;
+        }
+
+        /*
+         * Any work over there to steal?
+         */
+        speer = csched_runq_rr_steal(peer_cpu, cpu);
+        spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
+        if ( speer != NULL )
+        {
+            CSCHED_STAT_CRANK(rt_vcpu_migrate);
+            return speer;
+        }
+    }
+
+    if ( lock_failure_flag && snext->pri < CSCHED_PRI_RR && repeat_count > 1 )
+    {
+        lock_failure_flag = 0;
+        repeat_count--;
+        goto spinLockRetry;
+     }
+
+    /* Failed to find more important work elsewhere... */
+    __runq_remove(snext);
+    return snext;
+}
+
+static struct csched_vcpu * __runq_find_dom0_vcpu(int cpu)
+{
+    const struct list_head * const runq = RUNQ(cpu);
+    struct list_head *iter;
+
+    list_for_each( iter, runq )
+    {
+        struct csched_vcpu * iter_svc = __runq_elem(iter);
+        if (iter_svc->pri <= CSCHED_PRI_IDLE)
+            break;
+        if (iter_svc->vcpu->domain->domain_id == 0)
+            return iter_svc;
+    }
+   return NULL;
+}
+
 /*
  * This function is in the critical path. It is designed to be simple and
  * fast for the common case.
@@ -1166,6 +1650,8 @@ csched_schedule(s_time_t now)
     struct csched_vcpu * const scurr = CSCHED_VCPU(current);
     struct csched_vcpu *snext;
     struct task_slice ret;
+    int credit;
+    struct csched_vcpu *temp_snext;
 
     CSCHED_STAT_CRANK(schedule);
     CSCHED_VCPU_CHECK(current);
@@ -1173,11 +1659,26 @@ csched_schedule(s_time_t now)
     /*
      * Select next runnable local VCPU (ie top of local runq)
      */
+    if (opt_hardvirt && current->domain->domain_id == 0 && vcpu_runnable(current))
+    {
+        snext = scurr;
+        goto dom0_bypass;
+    }
+
     if ( vcpu_runnable(current) )
         __runq_insert(cpu, scurr);
     else
         BUG_ON( is_idle_vcpu(current) || list_empty(runq) );
 
+    if (opt_hardvirt && NUMBER_DOM0_VCPUS_PRESENT(cpu) > 0)
+    {
+        snext = __runq_find_dom0_vcpu(cpu);
+        if (snext){
+             __runq_remove(snext);
+             goto dom0_bypass;
+        }
+    }
+
     snext = __runq_elem(runq->next);
 
     /*
@@ -1188,10 +1689,32 @@ csched_schedule(s_time_t now)
      * urgent work... If not, csched_load_balance() will return snext, but
      * already removed from the runq.
      */
-    if ( snext->pri > CSCHED_PRI_TS_OVER )
+    /* HV - hard_virt_multiple might report false positive if a RR vcpu was
+     * put to sleep when it was in the runq or migrated off - an acceptable
+     * tradeoff for overhead of updating maps at sleep/wakeup points.
+     * Since hard_virt_multiple for self isn't updated at this point, there is
+     * a very small chance of false positive from self
+     */
+    if ( snext->pri < CSCHED_PRI_RR && !cpus_empty(csched_priv.hard_virt_multiple) )
+    {
+        CSCHED_STAT_CRANK(rt_imbalance);
+        temp_snext = csched_rr_load_balance(cpu, snext);
+        if (temp_snext){
+             snext = temp_snext;
+             goto dom0_bypass;
+        }
+    }
+
+    credit = atomic_read(&snext->credit);
+    if ( snext->pri > CSCHED_PRI_TS_OVER && credit > (CSCHED_CREDITS_PER_TSLICE >> 2))
         __runq_remove(snext);
-    else
-        snext = csched_load_balance(cpu, snext);
+    else{
+        if (snext->pri <= CSCHED_PRI_IDLE)
+              credit = -(CSCHED_CREDITS_PER_TSLICE<<1);
+        snext = csched_load_balance(cpu, snext, credit);
+    }
+
+  dom0_bypass:
 
     /*
      * Update idlers mask if necessary. When we're idling, other CPUs
@@ -1206,6 +1729,22 @@ csched_schedule(s_time_t now)
     {
         cpu_clear(cpu, csched_priv.idlers);
     }
+    if ( snext->pri == CSCHED_PRI_RR )
+    {
+        if ( cpu_isset(cpu, csched_priv.hard_virt_none) )
+             cpu_clear(cpu, csched_priv.hard_virt_none);
+        if (!list_empty(runq) && __runq_elem(runq->next)->pri == CSCHED_PRI_RR)
+             cpu_set(cpu, csched_priv.hard_virt_multiple);
+        else
+             cpu_clear(cpu, csched_priv.hard_virt_multiple);
+    }
+    else
+    {
+        if (!cpu_isset(cpu, csched_priv.hard_virt_none))
+             cpu_set(cpu, csched_priv.hard_virt_none);
+        if (cpu_isset(cpu, csched_priv.hard_virt_multiple))
+             cpu_clear(cpu, csched_priv.hard_virt_multiple);
+    }
 
     /*
      * Return task to run next...
@@ -1231,7 +1770,7 @@ csched_dump_vcpu(struct csched_vcpu *svc
 
     if ( sdom )
     {
-        printk(" credit=%i [w=%u]", atomic_read(&svc->credit), sdom->weight);
+        printk(" credit=%i of %d [w=%u]", atomic_read(&svc->credit), svc->credit_real_incr, sdom->weight);
 #ifdef CSCHED_STATS
         printk(" (%d+%u) {a/i=%u/%u m=%u+%u}",
                 svc->stats.credit_last,
@@ -1257,10 +1796,11 @@ csched_dump_pcpu(int cpu)
     spc = CSCHED_PCPU(cpu);
     runq = &spc->runq;
 
-    printk(" sort=%d, sibling=0x%lx, core=0x%lx\n",
+    printk(" sort=%d, sibling=0x%lx, core=0x%lx dom0=%u\n",
             spc->runq_sort_last,
             cpu_sibling_map[cpu].bits[0],
-            cpu_core_map[cpu].bits[0]);
+            cpu_core_map[cpu].bits[0],
+            NUMBER_DOM0_VCPUS_PRESENT(cpu));
 
     /* current VCPU */
     svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
@@ -1313,6 +1853,8 @@ csched_dump(void)
            CSCHED_TICKS_PER_ACCT);
 
     printk("idlers: 0x%lx\n", csched_priv.idlers.bits[0]);
+    printk("hard_virt_none: 0x%lx\n", csched_priv.hard_virt_none.bits[0]);
+    printk("hard_virt_multiple: 0x%lx\n", csched_priv.hard_virt_multiple.bits[0]);
 
     CSCHED_STATS_PRINTK();
 
@@ -1346,6 +1888,9 @@ csched_init(void)
     csched_priv.credit = 0U;
     csched_priv.credit_balance = 0;
     csched_priv.runq_sort = 0U;
+    spin_lock_init(&csched_priv.hard_virt_lock); /* HV */
+    cpus_clear(csched_priv.hard_virt_none);
+    cpus_clear(csched_priv.hard_virt_multiple);
     CSCHED_STATS_RESET();
 }
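
Much of the sched_credit.c hunk above is about run-queue ordering: vcpus are queued first by priority, with the new CSCHED_PRI_RR class (dom0 and hard-virt vcpus) sorting ahead of TS_UNDER/TS_OVER, and a well-credited vcpu is additionally allowed to queue ahead of same-priority entries that have fallen below half a timeslice of credit. The toy model below illustrates that ordering rule only; toy_vcpu, toy_runq_insert and the constants are made up for the sketch and are not Xen's structures.

    #include <stdio.h>

    #define PRI_RR              10
    #define PRI_UNDER           -1
    #define CREDITS_PER_TSLICE  300

    struct toy_vcpu {
        const char *name;
        int pri;
        int credit;
        struct toy_vcpu *next;
    };

    /* Insert by priority, letting a well-credited vcpu pass same-priority
     * entries whose credit has dropped below half a timeslice. */
    static void toy_runq_insert(struct toy_vcpu **runq, struct toy_vcpu *svc)
    {
        struct toy_vcpu **pp = runq;

        while ( *pp != NULL )
        {
            struct toy_vcpu *cur = *pp;
            if ( svc->pri > cur->pri )
                break;
            if ( svc->credit >= CREDITS_PER_TSLICE / 2 &&
                 svc->pri == cur->pri &&
                 cur->credit < CREDITS_PER_TSLICE / 2 )
                break;
            pp = &cur->next;
        }
        svc->next = *pp;
        *pp = svc;
    }

    int main(void)
    {
        struct toy_vcpu a = { "rr",    PRI_RR,    0,   NULL };
        struct toy_vcpu b = { "under", PRI_UNDER, 250, NULL };
        struct toy_vcpu c = { "tired", PRI_UNDER, 20,  NULL };
        struct toy_vcpu *runq = NULL, *p;

        toy_runq_insert(&runq, &c);
        toy_runq_insert(&runq, &b);   /* enough credit: queues ahead of "tired" */
        toy_runq_insert(&runq, &a);   /* RR priority: goes to the head */

        for ( p = runq; p != NULL; p = p->next )
            printf("%s\n", p->name);  /* prints rr, under, tired */
        return 0;
    }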
 
diff -Naurp xen/common/schedule.c xen-redhat/common/schedule.c
--- xen/common/schedule.c
+++ xen-redhat/common/schedule.c
@@ -37,10 +37,6 @@
 static char opt_sched[10] = "credit";
 string_param("sched", opt_sched);
 
-/* opt_dom0_vcpus_pin: If true, dom0 VCPUs are pinned. */
-static unsigned int opt_dom0_vcpus_pin;
-boolean_param("dom0_vcpus_pin", opt_dom0_vcpus_pin);
-
 #define TIME_SLOP      (s32)MICROSECS(50)     /* allow time to slip a bit */
 
 /* Various timer handlers. */
@@ -105,7 +101,7 @@ int sched_init_vcpu(struct vcpu *v, unsi
      * domain-0 VCPUs, are pinned onto their respective physical CPUs.
      */
     v->processor = processor;
-    if ( is_idle_domain(d) || ((d->domain_id == 0) && opt_dom0_vcpus_pin) )
+    if ( is_idle_domain(d) || d->is_pinned )
         v->cpu_affinity = cpumask_of_cpu(processor);
     else
         cpus_setall(v->cpu_affinity);
@@ -250,12 +246,11 @@ void vcpu_force_reschedule(struct vcpu *
     }
 }
 
-int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
+static int __vcpu_set_affinity(
+    struct vcpu *v, cpumask_t *affinity,
+    bool_t old_lock_status, bool_t new_lock_status)
 {
-    cpumask_t online_affinity;
-
-    if ( (v->domain->domain_id == 0) && opt_dom0_vcpus_pin )
-        return -EINVAL;
+    cpumask_t online_affinity, old_affinity;
 
     cpus_and(online_affinity, *affinity, cpu_online_map);
     if ( cpus_empty(online_affinity) )
@@ -263,7 +258,18 @@ int vcpu_set_affinity(struct vcpu *v, cp
 
     vcpu_schedule_lock_irq(v);
 
+    if ( v->affinity_locked != old_lock_status )
+    {
+        BUG_ON(!v->affinity_locked);
+        vcpu_schedule_unlock_irq(v);
+        return -EBUSY;
+    }
+
+    v->affinity_locked = new_lock_status;
+
+    old_affinity = v->cpu_affinity;
     v->cpu_affinity = *affinity;
+    *affinity = old_affinity;
     if ( !cpu_isset(v->processor, v->cpu_affinity) )
         set_bit(_VPF_migrating, &v->pause_flags);
 
@@ -278,6 +284,31 @@ int vcpu_set_affinity(struct vcpu *v, cp
     return 0;
 }
 
+int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
+{
+    if ( v->domain->is_pinned )
+        return -EINVAL;
+    return __vcpu_set_affinity(v, affinity, 0, 0);
+}
+
+int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity)
+{
+    return __vcpu_set_affinity(v, affinity, 0, 1);
+}
+
+void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity)
+{
+    cpumask_t online_affinity;
+
+    /* Do not fail if no CPU in old affinity mask is online. */
+    cpus_and(online_affinity, *affinity, cpu_online_map);
+    if ( cpus_empty(online_affinity) )
+        *affinity = cpu_online_map;
+
+    if ( __vcpu_set_affinity(v, affinity, 1, 0) != 0 )
+        BUG();
+}
+
 /* Block the currently-executing domain until a pertinent event occurs. */
 static long do_block(void)
 {
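
The schedule.c changes above funnel vcpu_set_affinity(), vcpu_lock_affinity() and vcpu_unlock_affinity() through one helper that installs the new mask and returns the previous one through the same pointer, so a caller that temporarily pins a vcpu can later restore the old affinity. A simplified sketch of that set-and-return-old idiom follows; toy_cpumask_t and toy_set_affinity() are stand-ins rather than the real cpumask API, and the scheduler locking is omitted.

    #include <stdio.h>

    typedef unsigned long toy_cpumask_t;

    struct toy_vcpu {
        toy_cpumask_t cpu_affinity;
        int affinity_locked;
    };

    /* Swap in a new mask; the caller's variable receives the old mask. */
    static int toy_set_affinity(struct toy_vcpu *v, toy_cpumask_t *affinity,
                                int old_lock, int new_lock)
    {
        toy_cpumask_t old;

        if ( v->affinity_locked != old_lock )
            return -1;                 /* affinity is locked by someone else */
        v->affinity_locked = new_lock;

        old = v->cpu_affinity;
        v->cpu_affinity = *affinity;
        *affinity = old;               /* hand the previous mask back */
        return 0;
    }

    int main(void)
    {
        struct toy_vcpu v = { .cpu_affinity = 0xf, .affinity_locked = 0 };
        toy_cpumask_t saved = 0x1;     /* pin to CPU0, remember the old mask */

        toy_set_affinity(&v, &saved, 0, 1);   /* "lock": saved now holds 0xf */
        toy_set_affinity(&v, &saved, 1, 0);   /* "unlock": restores 0xf */
        printf("affinity=%#lx\n", v.cpu_affinity);
        return 0;
    }
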
diff -Naurp xen/common/sysctl.c xen-redhat/common/sysctl.c
--- xen/common/sysctl.c
+++ xen-redhat/common/sysctl.c
@@ -21,6 +21,9 @@
 #include <xen/keyhandler.h>
 #include <asm/current.h>
 #include <public/sysctl.h>
+#include <asm/numa.h>
+#include <xen/nodemask.h>
+
 
 extern long arch_do_sysctl(
     struct xen_sysctl *op, XEN_GUEST_HANDLE(xen_sysctl_t) u_sysctl);
@@ -38,7 +41,17 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc
         return -EFAULT;
 
     if ( op->interface_version != XEN_SYSCTL_INTERFACE_VERSION )
-        return -EACCES;
+    {
+        /*
+         * RHEL5 ABI compat: Allow through physinfo calls with
+         * newer versions for NUMA extensions
+         */
+        if (op->cmd == XEN_SYSCTL_physinfo &&
+            op->interface_version == (XEN_SYSCTL_INTERFACE_VERSION+1))
+            dprintk(XENLOG_DEBUG, "Allowing physinfo call with newer ABI version\n");
+        else
+            return -EACCES;
+    }
 
     spin_lock(&sysctl_lock);
 
@@ -112,6 +125,18 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc
     }
     break;
 
+    case XEN_SYSCTL_availheap:
+    {
+        op->u.availheap.avail_bytes = avail_domheap_pages_region(
+            op->u.availheap.node,
+            op->u.availheap.min_bitwidth,
+            op->u.availheap.max_bitwidth);
+        op->u.availheap.avail_bytes <<= PAGE_SHIFT;
+
+        ret = copy_to_guest(u_sysctl, op, 1) ? -EFAULT : 0;
+    }
+    break;
+
 #ifdef PERF_COUNTERS
     case XEN_SYSCTL_perfc_op:
     {
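
The sysctl change above keeps the strict interface-version check but adds one compatibility exception: a physinfo call presented with interface_version + 1 is let through so NUMA-extended tools keep working, while every other mismatch still returns -EACCES. The pattern, reduced to a few lines with invented constants (TOY_IFACE_VERSION, TOY_CMD_PHYSINFO):

    #include <stdio.h>
    #include <errno.h>

    #define TOY_IFACE_VERSION  6
    #define TOY_CMD_PHYSINFO   3

    static int toy_check_version(int cmd, int version)
    {
        if ( version == TOY_IFACE_VERSION )
            return 0;
        /* compat exception: newer physinfo callers (NUMA extensions) */
        if ( cmd == TOY_CMD_PHYSINFO && version == TOY_IFACE_VERSION + 1 )
            return 0;
        return -EACCES;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               toy_check_version(TOY_CMD_PHYSINFO, 7),  /* 0: allowed through */
               toy_check_version(1, 7),                 /* -EACCES */
               toy_check_version(1, 6));                /* 0 */
        return 0;
    }
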
diff -Naurp xen/common/trace.c xen-redhat/common/trace.c
--- xen/common/trace.c
+++ xen-redhat/common/trace.c
@@ -37,7 +37,7 @@
 #define xen_t_buf t_buf
 CHECK_t_buf;
 #undef xen_t_buf
-#define TB_COMPAT IS_COMPAT(dom0)
+#define TB_COMPAT is_pv_32on64_domain(dom0)
 #else
 #define compat_t_rec t_rec
 #define TB_COMPAT 0
diff -Naurp xen/common/xenoprof.c xen-redhat/common/xenoprof.c
--- xen/common/xenoprof.c
+++ xen-redhat/common/xenoprof.c
@@ -171,7 +171,7 @@ static int alloc_xenoprof_struct(
     bufsize = sizeof(struct xenoprof_buf);
     i = sizeof(struct event_log);
 #ifdef CONFIG_COMPAT
-    d->xenoprof->is_compat = IS_COMPAT(is_passive ? dom0 : d);
+    d->xenoprof->is_compat = is_pv_32on64_domain(is_passive ? dom0 : d);
     if ( XENOPROF_COMPAT(d->xenoprof) )
     {
         bufsize = sizeof(struct compat_oprof_buf);
diff -Naurp xen/drivers/acpi/tables.c xen-redhat/drivers/acpi/tables.c
--- xen/drivers/acpi/tables.c
+++ xen-redhat/drivers/acpi/tables.c
@@ -59,6 +59,8 @@ static char *acpi_table_signatures[ACPI_
 	[ACPI_SPMI] = "SPMI",
 	[ACPI_HPET] = "HPET",
 	[ACPI_MCFG] = "MCFG",
+	[ACPI_DMAR] = "DMAR",
+	[ACPI_IVRS] = "IVRS",
 };
 
 static char *mps_inti_flags_polarity[] = { "dfl", "high", "res", "low" };
diff -Naurp xen/drivers/char/console.c xen-redhat/drivers/char/console.c
--- xen/drivers/char/console.c
+++ xen-redhat/drivers/char/console.c
@@ -313,18 +313,16 @@ static long guest_console_write(XEN_GUES
 
     while ( count > 0 )
     {
-        while ( serial_tx_space(sercon_handle) < (SERIAL_TXBUFSZ / 2) )
-        {
-            if ( hypercall_preempt_check() )
-                break;
-            cpu_relax();
-        }
-
         if ( hypercall_preempt_check() )
             return hypercall_create_continuation(
                 __HYPERVISOR_console_io, "iih",
                 CONSOLEIO_write, count, buffer);
 
+        if ( serial_tx_space(sercon_handle) < (SERIAL_TXBUFSZ / 2) )
+        {
+            return 0;
+        }
+
         kcount = min_t(int, count, sizeof(kbuf)-1);
         if ( copy_from_guest(kbuf, buffer, kcount) )
             return -EFAULT;
@@ -587,16 +585,6 @@ void __init console_endboot(void)
     switch_serial_input();
 }
 
-void console_start_log_everything(void)
-{
-    atomic_inc(&print_everything);
-}
-
-void console_end_log_everything(void)
-{
-    atomic_dec(&print_everything);
-}
-
 void console_force_unlock(void)
 {
     spin_lock_init(&console_lock);
@@ -611,14 +599,14 @@ void console_force_lock(void)
 
 void console_start_sync(void)
 {
-    console_start_log_everything();
+    atomic_inc(&print_everything);
     serial_start_sync(sercon_handle);
 }
 
 void console_end_sync(void)
 {
     serial_end_sync(sercon_handle);
-    console_end_log_everything();
+    atomic_dec(&print_everything);
 }
 
 void console_putc(char c)
diff -Naurp xen/drivers/char/serial.c xen-redhat/drivers/char/serial.c
--- xen/drivers/char/serial.c
+++ xen-redhat/drivers/char/serial.c
@@ -3,7 +3,7 @@
  * 
  * Framework for serial device drivers.
  * 
- * Copyright (c) 2003-2005, K A Fraser
+ * Copyright (c) 2003-2008, K A Fraser
  */
 
 #include <xen/config.h>
@@ -81,13 +81,21 @@ void serial_tx_interrupt(struct serial_p
 
 static void __serial_putc(struct serial_port *port, char c)
 {
-    int i;
-
     if ( (port->txbuf != NULL) && !port->sync )
     {
         /* Interrupt-driven (asynchronous) transmitter. */
+        if ( port->tx_quench )
+        {
+            /* Buffer filled and we are dropping characters. */
+            if ( (port->txbufp - port->txbufc) > (SERIAL_TXBUFSZ / 2) )
+                return;
+            port->tx_quench = 0;
+        }
+
         if ( (port->txbufp - port->txbufc) == SERIAL_TXBUFSZ )
         {
+#ifdef SERIAL_NEVER_DROP_CHARS
+            int i;
             /* Buffer is full: we spin, but could alternatively drop chars. */
             while ( !port->driver->tx_empty(port) )
                 cpu_relax();
@@ -95,6 +103,10 @@ static void __serial_putc(struct serial_
                 port->driver->putc(
                     port, port->txbuf[MASK_SERIAL_TXBUF_IDX(port->txbufc++)]);
             port->txbuf[MASK_SERIAL_TXBUF_IDX(port->txbufp++)] = c;
+#else
+            /* Buffer is full: drop characters until buffer is half empty. */
+            port->tx_quench = 1;
+#endif
         }
         else if ( ((port->txbufp - port->txbufc) == 0) &&
                   port->driver->tx_empty(port) )
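
The serial.c hunk above changes __serial_putc() so that, unless SERIAL_NEVER_DROP_CHARS is defined, a completely full transmit buffer flips the port into a tx_quench state: characters are dropped until the buffer drains back below half full, instead of spinning on the UART. The toy ring-buffer transmitter below shows the same policy; the buffer size and names are illustrative only.

    #include <stdio.h>

    #define TXBUFSZ 16u

    struct toy_port {
        char buf[TXBUFSZ];
        unsigned int prod, cons;   /* free-running producer/consumer indices */
        int tx_quench;
    };

    static void toy_putc(struct toy_port *p, char c)
    {
        if ( p->tx_quench )
        {
            if ( (p->prod - p->cons) > (TXBUFSZ / 2) )
                return;                /* still draining: drop the character */
            p->tx_quench = 0;          /* below half full again: resume */
        }

        if ( (p->prod - p->cons) == TXBUFSZ )
        {
            p->tx_quench = 1;          /* buffer full: start dropping */
            return;
        }

        p->buf[p->prod++ % TXBUFSZ] = c;
    }

    int main(void)
    {
        struct toy_port port = { .prod = 0, .cons = 0, .tx_quench = 0 };
        const char *msg = "a long burst of console output that overflows";

        while ( *msg )
            toy_putc(&port, *msg++);
        printf("queued %u characters, dropped the rest\n",
               port.prod - port.cons);
        return 0;
    }
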
diff -Naurp xen/drivers/Makefile xen-redhat/drivers/Makefile
--- xen/drivers/Makefile
+++ xen-redhat/drivers/Makefile
@@ -1,3 +1,6 @@
 subdir-y += char
+subdir-y += pci
+subdir-$(x86_32) += passthrough
+subdir-$(x86_64) += passthrough
 subdir-$(HAS_ACPI) += acpi
 subdir-$(HAS_VGA) += video
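
The new AMD IOMMU ACPI parser that follows keeps a single unity-mapped range per device; reserve_unity_map_for_device() folds each additional request into the existing one by taking the union of the two [base, base + length) intervals. A minimal sketch of that merge, with toy_unity_map standing in for an ivrs_mappings[] entry:

    #include <stdio.h>

    struct toy_unity_map {
        int enabled;
        unsigned long base;
        unsigned long length;
    };

    /* Merge a new [base, base + length) request into the existing range. */
    static void toy_reserve_unity_map(struct toy_unity_map *m,
                                      unsigned long base, unsigned long length)
    {
        if ( m->enabled )
        {
            unsigned long old_top = m->base + m->length;
            unsigned long new_top = base + length;

            if ( old_top > new_top )
                new_top = old_top;     /* keep the higher end */
            if ( m->base < base )
                base = m->base;        /* keep the lower start */
            length = new_top - base;
        }
        m->enabled = 1;
        m->base = base;
        m->length = length;
    }

    int main(void)
    {
        struct toy_unity_map m = { 0, 0, 0 };

        toy_reserve_unity_map(&m, 0x100000, 0x1000);
        toy_reserve_unity_map(&m, 0x0f0000, 0x2000);      /* merges in below */
        printf("base=%#lx len=%#lx\n", m.base, m.length); /* 0xf0000, 0x11000 */
        return 0;
    }
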
diff -Naurp xen/drivers/passthrough/amd/iommu_acpi.c xen-redhat/drivers/passthrough/amd/iommu_acpi.c
--- xen/drivers/passthrough/amd/iommu_acpi.c
+++ xen-redhat/drivers/passthrough/amd/iommu_acpi.c
@@ -0,0 +1,1041 @@
+/*
+ * Copyright (C) 2007 Advanced Micro Devices, Inc.
+ * Author: Leo Duran <leo.duran@amd.com>
+ * Author: Wei Wang <wei.wang2@amd.com> - adapted to xen
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <xen/config.h>
+#include <xen/errno.h>
+#include <asm/amd-iommu.h>
+#include <asm/hvm/svm/amd-iommu-proto.h>
+#include <asm/hvm/svm/amd-iommu-acpi.h>
+
+extern unsigned long amd_iommu_page_entries;
+extern unsigned short ivrs_bdf_entries;
+extern struct ivrs_mappings *ivrs_mappings;
+extern unsigned short last_bdf;
+extern int ioapic_bdf[MAX_IO_APICS];
+unsigned int parse_ivrs_table_error;
+extern void *shared_intremap_table;
+
+static void add_ivrs_mapping_entry(
+            u16 bdf, u16 alias_id, u8 flags, struct amd_iommu *iommu)
+{
+    u8 sys_mgt, lint1_pass, lint0_pass, nmi_pass, ext_int_pass, init_pass;
+    ASSERT( ivrs_mappings != NULL );
+    
+    /* setup requestor id */
+    ivrs_mappings[bdf].dte_requestor_id = alias_id;
+
+    /* override flags for range of devices */
+    sys_mgt = get_field_from_byte(flags,
+                                  AMD_IOMMU_ACPI_SYS_MGT_MASK,
+                                  AMD_IOMMU_ACPI_SYS_MGT_SHIFT);
+    lint1_pass = get_field_from_byte(flags,
+                                     AMD_IOMMU_ACPI_LINT1_PASS_MASK,
+                                     AMD_IOMMU_ACPI_LINT1_PASS_SHIFT);
+    lint0_pass = get_field_from_byte(flags,
+                                     AMD_IOMMU_ACPI_LINT0_PASS_MASK,
+                                     AMD_IOMMU_ACPI_LINT0_PASS_SHIFT);
+    nmi_pass = get_field_from_byte(flags,
+                                   AMD_IOMMU_ACPI_NMI_PASS_MASK,
+                                   AMD_IOMMU_ACPI_NMI_PASS_SHIFT);
+    ext_int_pass = get_field_from_byte(flags,
+                                       AMD_IOMMU_ACPI_EINT_PASS_MASK,
+                                       AMD_IOMMU_ACPI_EINT_PASS_SHIFT);
+    init_pass = get_field_from_byte(flags,
+                                    AMD_IOMMU_ACPI_INIT_PASS_MASK,
+                                    AMD_IOMMU_ACPI_INIT_PASS_SHIFT);
+    
+    ivrs_mappings[bdf].dte_sys_mgt_enable = sys_mgt;
+    ivrs_mappings[bdf].dte_lint1_pass = lint1_pass;
+    ivrs_mappings[bdf].dte_lint0_pass = lint0_pass;
+    ivrs_mappings[bdf].dte_nmi_pass = nmi_pass;
+    ivrs_mappings[bdf].dte_ext_int_pass = ext_int_pass;
+    ivrs_mappings[bdf].dte_init_pass = init_pass;
+    
+    if (ivrs_mappings[alias_id].intremap_table == NULL )
+    {
+         /* allocate per-device interrupt remapping table */
+         if ( amd_iommu_perdev_intremap )
+             ivrs_mappings[alias_id].intremap_table =
+                amd_iommu_alloc_intremap_table();
+         else
+         {
+             if ( shared_intremap_table == NULL  )
+                 shared_intremap_table = amd_iommu_alloc_intremap_table();
+             ivrs_mappings[alias_id].intremap_table = shared_intremap_table;
+         }
+    }
+    /* assign iommu hardware */
+    ivrs_mappings[bdf].iommu = iommu;
+}
+
+
+static struct amd_iommu * __init find_iommu_from_bdf_cap(
+    u16 bdf, u8 cap_offset)
+{
+    struct amd_iommu *iommu;
+
+    for_each_amd_iommu ( iommu )
+        if ( (iommu->bdf == bdf) && (iommu->cap_offset == cap_offset) )
+            return iommu;
+
+    return NULL;
+}
+
+static void __init reserve_iommu_exclusion_range(
+    struct amd_iommu *iommu, uint64_t base, uint64_t limit)
+{
+    /* need to extend exclusion range? */
+    if ( iommu->exclusion_enable )
+    {
+        if ( iommu->exclusion_base < base )
+            base = iommu->exclusion_base;
+        if ( iommu->exclusion_limit > limit )
+            limit = iommu->exclusion_limit;
+    }
+
+    iommu->exclusion_enable = IOMMU_CONTROL_ENABLED;
+    iommu->exclusion_base = base;
+    iommu->exclusion_limit = limit;
+}
+
+static void __init reserve_iommu_exclusion_range_all(
+    struct amd_iommu *iommu,
+    unsigned long base, unsigned long limit)
+{
+    reserve_iommu_exclusion_range(iommu, base, limit);
+    iommu->exclusion_allow_all = IOMMU_CONTROL_ENABLED;
+}
+
+static void __init reserve_unity_map_for_device(
+    u16 bdf, unsigned long base,
+    unsigned long length, u8 iw, u8 ir)
+{
+    unsigned long old_top, new_top;
+
+    /* need to extend unity-mapped range? */
+    if ( ivrs_mappings[bdf].unity_map_enable )
+    {
+        old_top = ivrs_mappings[bdf].addr_range_start +
+            ivrs_mappings[bdf].addr_range_length;
+        new_top = base + length;
+        if ( old_top > new_top )
+            new_top = old_top;
+        if ( ivrs_mappings[bdf].addr_range_start < base )
+            base = ivrs_mappings[bdf].addr_range_start;
+        length = new_top - base;
+    }
+
+    /* extend r/w permissions and keep aggregate */
+    ivrs_mappings[bdf].write_permission = iw;
+    ivrs_mappings[bdf].read_permission = ir;
+    ivrs_mappings[bdf].unity_map_enable = IOMMU_CONTROL_ENABLED;
+    ivrs_mappings[bdf].addr_range_start = base;
+    ivrs_mappings[bdf].addr_range_length = length;
+}
+
+static int __init register_exclusion_range_for_all_devices(
+    unsigned long base, unsigned long limit, u8 iw, u8 ir)
+{
+    unsigned long range_top, iommu_top, length;
+    struct amd_iommu *iommu;
+    u16 bdf;
+
+    /* is part of exclusion range inside of IOMMU virtual address space? */
+    /* note: 'limit' parameter is assumed to be page-aligned */
+    range_top = limit + PAGE_SIZE;
+    iommu_top = max_page * PAGE_SIZE;
+    if ( base < iommu_top )
+    {
+        if ( range_top > iommu_top )
+            range_top = iommu_top;
+        length = range_top - base;
+        /* reserve r/w unity-mapped page entries for devices */
+        /* note: these entries are part of the exclusion range */
+        for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ )
+            reserve_unity_map_for_device(bdf, base, length, iw, ir);
+        /* push 'base' just outside of virtual address space */
+        base = iommu_top;
+    }
+    /* register IOMMU exclusion range settings */
+    if ( limit >= iommu_top )
+    {
+        for_each_amd_iommu( iommu )
+            reserve_iommu_exclusion_range_all(iommu, base, limit);
+    }
+
+    return 0;
+}
+
+static int __init register_exclusion_range_for_device(
+    u16 bdf, unsigned long base, unsigned long limit, u8 iw, u8 ir)
+{
+    unsigned long range_top, iommu_top, length;
+    struct amd_iommu *iommu;
+    u16 bus, devfn, req;
+
+    bus = bdf >> 8;
+    devfn = bdf & 0xFF;
+    iommu = find_iommu_for_device(bus, devfn);
+    if ( !iommu )
+    {
+        AMD_IOMMU_DEBUG("IVMD Error: No IOMMU for Dev_Id 0x%x!\n", bdf);
+        return -ENODEV;
+    }
+    req = ivrs_mappings[bdf].dte_requestor_id;
+
+    /* note: 'limit' parameter is assumed to be page-aligned */
+    range_top = limit + PAGE_SIZE;
+    iommu_top = max_page * PAGE_SIZE;
+    if ( base < iommu_top )
+    {
+        if ( range_top > iommu_top )
+            range_top = iommu_top;
+        length = range_top - base;
+        /* reserve unity-mapped page entries for device */
+        /* note: these entries are part of the exclusion range */
+        reserve_unity_map_for_device(bdf, base, length, iw, ir);
+        reserve_unity_map_for_device(req, base, length, iw, ir);
+
+        /* push 'base' just outside of virtual address space */
+        base = iommu_top;
+    }
+
+    /* register IOMMU exclusion range settings for device */
+    if ( limit >= iommu_top  )
+    {
+        reserve_iommu_exclusion_range(iommu, base, limit);
+        ivrs_mappings[bdf].dte_allow_exclusion = IOMMU_CONTROL_ENABLED;
+        ivrs_mappings[req].dte_allow_exclusion = IOMMU_CONTROL_ENABLED;
+    }
+
+    return 0;
+}
+
+static int __init register_exclusion_range_for_iommu_devices(
+    struct amd_iommu *iommu,
+    unsigned long base, unsigned long limit, u8 iw, u8 ir)
+{
+    unsigned long range_top, iommu_top, length;
+    u16 bus, devfn, bdf, req;
+
+    /* is part of exclusion range inside of IOMMU virtual address space? */
+    /* note: 'limit' parameter is assumed to be page-aligned */
+    range_top = limit + PAGE_SIZE;
+    iommu_top = max_page * PAGE_SIZE;
+    if ( base < iommu_top )
+    {
+        if ( range_top > iommu_top )
+            range_top = iommu_top;
+        length = range_top - base;
+        /* reserve r/w unity-mapped page entries for devices */
+        /* note: these entries are part of the exclusion range */
+        for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ )
+        {
+            bus = bdf >> 8;
+            devfn = bdf & 0xFF;
+            if ( iommu == find_iommu_for_device(bus, devfn) )
+            {
+                reserve_unity_map_for_device(bdf, base, length, iw, ir);
+                req = ivrs_mappings[bdf].dte_requestor_id;
+                reserve_unity_map_for_device(req, base, length, iw, ir);
+            }
+        }
+
+        /* push 'base' just outside of virtual address space */
+        base = iommu_top;
+    }
+
+    /* register IOMMU exclusion range settings */
+    if ( limit >= iommu_top )
+        reserve_iommu_exclusion_range_all(iommu, base, limit);
+    return 0;
+}
+
+static int __init parse_ivmd_device_select(
+    struct acpi_ivmd_block_header *ivmd_block,
+    unsigned long base, unsigned long limit, u8 iw, u8 ir)
+{
+    u16 bdf;
+
+    bdf = ivmd_block->header.dev_id;
+    if ( bdf >= ivrs_bdf_entries )
+    {
+        AMD_IOMMU_DEBUG("IVMD Error: Invalid Dev_Id 0x%x\n", bdf);
+        return -ENODEV;
+    }
+
+    return register_exclusion_range_for_device(bdf, base, limit, iw, ir);
+}
+
+static int __init parse_ivmd_device_range(
+    struct acpi_ivmd_block_header *ivmd_block,
+    unsigned long base, unsigned long limit, u8 iw, u8 ir)
+{
+    u16 first_bdf, last_bdf, bdf;
+    int error;
+
+    first_bdf = ivmd_block->header.dev_id;
+    if ( first_bdf >= ivrs_bdf_entries )
+    {
+        AMD_IOMMU_DEBUG(
+            "IVMD Error: Invalid Range_First Dev_Id 0x%x\n", first_bdf);
+        return -ENODEV;
+    }
+
+    last_bdf = ivmd_block->last_dev_id;
+    if ( (last_bdf >= ivrs_bdf_entries) || (last_bdf <= first_bdf) )
+    {
+        AMD_IOMMU_DEBUG( 
+            "IVMD Error: Invalid Range_Last Dev_Id 0x%x\n", last_bdf);
+        return -ENODEV;
+    }
+
+    for ( bdf = first_bdf, error = 0; (bdf <= last_bdf) && !error; bdf++ )
+        error = register_exclusion_range_for_device(
+            bdf, base, limit, iw, ir);
+
+    return error;
+}
+
+static int __init parse_ivmd_device_iommu(
+    struct acpi_ivmd_block_header *ivmd_block,
+    unsigned long base, unsigned long limit, u8 iw, u8 ir)
+{
+    struct amd_iommu *iommu;
+
+    /* find target IOMMU */
+    iommu = find_iommu_from_bdf_cap(ivmd_block->header.dev_id,
+                                    ivmd_block->cap_offset);
+    if ( !iommu )
+    {
+        AMD_IOMMU_DEBUG("IVMD Error: No IOMMU for Dev_Id 0x%x  Cap 0x%x\n",
+                        ivmd_block->header.dev_id, ivmd_block->cap_offset);
+        return -ENODEV;
+    }
+
+    return register_exclusion_range_for_iommu_devices(
+        iommu, base, limit, iw, ir);
+}
+
+static int __init parse_ivmd_block(struct acpi_ivmd_block_header *ivmd_block)
+{
+    unsigned long start_addr, mem_length, base, limit;
+    u8 iw, ir;
+
+    if ( ivmd_block->header.length <
+         sizeof(struct acpi_ivmd_block_header) )
+    {
+        AMD_IOMMU_DEBUG("IVMD Error: Invalid Block Length!\n");
+        return -ENODEV;
+    }
+
+    start_addr = (unsigned long)ivmd_block->start_addr;
+    mem_length = (unsigned long)ivmd_block->mem_length;
+    base = start_addr & PAGE_MASK;
+    limit = (start_addr + mem_length - 1) & PAGE_MASK;
+
+    AMD_IOMMU_DEBUG("IVMD Block: Type 0x%x\n",ivmd_block->header.type);
+    AMD_IOMMU_DEBUG(" Start_Addr_Phys 0x%lx\n", start_addr);
+    AMD_IOMMU_DEBUG(" Mem_Length 0x%lx\n", mem_length);
+
+    if ( get_field_from_byte(ivmd_block->header.flags,
+                             AMD_IOMMU_ACPI_EXCLUSION_RANGE_MASK,
+                             AMD_IOMMU_ACPI_EXCLUSION_RANGE_SHIFT) )
+        iw = ir = IOMMU_CONTROL_ENABLED;
+    else if ( get_field_from_byte(ivmd_block->header.flags,
+                                  AMD_IOMMU_ACPI_UNITY_MAPPING_MASK,
+                                  AMD_IOMMU_ACPI_UNITY_MAPPING_SHIFT) )
+    {
+        iw = get_field_from_byte(ivmd_block->header.flags,
+                                 AMD_IOMMU_ACPI_IW_PERMISSION_MASK,
+                                 AMD_IOMMU_ACPI_IW_PERMISSION_SHIFT);
+        ir = get_field_from_byte(ivmd_block->header.flags,
+                                 AMD_IOMMU_ACPI_IR_PERMISSION_MASK,
+                                 AMD_IOMMU_ACPI_IR_PERMISSION_SHIFT);
+    }
+    else
+    {
+        AMD_IOMMU_DEBUG("IVMD Error: Invalid Flag Field!\n");
+        return -ENODEV;
+    }
+
+    switch( ivmd_block->header.type )
+    {
+    case AMD_IOMMU_ACPI_IVMD_ALL_TYPE:
+        return register_exclusion_range_for_all_devices(
+            base, limit, iw, ir);
+
+    case AMD_IOMMU_ACPI_IVMD_ONE_TYPE:
+        return parse_ivmd_device_select(ivmd_block,
+                                        base, limit, iw, ir);
+
+    case AMD_IOMMU_ACPI_IVMD_RANGE_TYPE:
+        return parse_ivmd_device_range(ivmd_block,
+                                       base, limit, iw, ir);
+
+    case AMD_IOMMU_ACPI_IVMD_IOMMU_TYPE:
+        return parse_ivmd_device_iommu(ivmd_block,
+                                       base, limit, iw, ir);
+
+    default:
+        AMD_IOMMU_DEBUG("IVMD Error: Invalid Block Type!\n");
+        return -ENODEV;
+    }
+}
+
+static u16 __init parse_ivhd_device_padding(
+    u16 pad_length, u16 header_length, u16 block_length)
+{
+    if ( header_length < (block_length + pad_length) )
+    {
+        AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n");
+        return 0;
+    }
+
+    return pad_length;
+}
+
+static u16 __init parse_ivhd_device_select(
+    union acpi_ivhd_device *ivhd_device, struct amd_iommu *iommu)
+{
+    u16 bdf;
+
+    bdf = ivhd_device->header.dev_id;
+    if ( bdf >= ivrs_bdf_entries )
+    {
+        AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Dev_Id 0x%x\n", bdf);
+        return 0;
+    }
+
+    /* override flags for device */
+    add_ivrs_mapping_entry(bdf, bdf, ivhd_device->header.flags, iommu);
+
+    return sizeof(struct acpi_ivhd_device_header);
+}
+
+static u16 __init parse_ivhd_device_range(
+    union acpi_ivhd_device *ivhd_device,
+    u16 header_length, u16 block_length, struct amd_iommu *iommu)
+{
+    u16 dev_length, first_bdf, last_bdf, bdf;
+
+    dev_length = sizeof(struct acpi_ivhd_device_range);
+    if ( header_length < (block_length + dev_length) )
+    {
+        AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n");
+        return 0;
+    }
+
+    if ( ivhd_device->range.trailer.type !=
+         AMD_IOMMU_ACPI_IVHD_DEV_RANGE_END )
+    {
+        AMD_IOMMU_DEBUG("IVHD Error: "
+                        "Invalid Range: End_Type 0x%x\n",
+                        ivhd_device->range.trailer.type);
+        return 0;
+    }
+
+    first_bdf = ivhd_device->header.dev_id;
+    if ( first_bdf >= ivrs_bdf_entries )
+    {
+        AMD_IOMMU_DEBUG(
+            "IVHD Error: Invalid Range: First Dev_Id 0x%x\n", first_bdf);
+        return 0;
+    }
+
+    last_bdf = ivhd_device->range.trailer.dev_id;
+    if ( (last_bdf >= ivrs_bdf_entries) || (last_bdf <= first_bdf) )
+    {
+        AMD_IOMMU_DEBUG(
+            "IVHD Error: Invalid Range: Last Dev_Id 0x%x\n", last_bdf);
+        return 0;
+    }
+
+    AMD_IOMMU_DEBUG(" Dev_Id Range: 0x%x -> 0x%x\n", first_bdf, last_bdf);
+
+    for ( bdf = first_bdf; bdf <= last_bdf; bdf++ )
+        add_ivrs_mapping_entry(bdf, bdf, ivhd_device->header.flags, iommu);
+
+    return dev_length;
+}
+
+static u16 __init parse_ivhd_device_alias(
+    union acpi_ivhd_device *ivhd_device,
+    u16 header_length, u16 block_length, struct amd_iommu *iommu)
+{
+    u16 dev_length, alias_id, bdf;
+
+    dev_length = sizeof(struct acpi_ivhd_device_alias);
+    if ( header_length < (block_length + dev_length) )
+    {
+        AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n");
+        return 0;
+    }
+
+    bdf = ivhd_device->header.dev_id;
+    if ( bdf >= ivrs_bdf_entries )
+    {
+        AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Dev_Id 0x%x\n", bdf);
+        return 0;
+    }
+
+    alias_id = ivhd_device->alias.dev_id;
+    if ( alias_id >= ivrs_bdf_entries )
+    {
+        AMD_IOMMU_DEBUG("IVHD Error: Invalid Alias Dev_Id 0x%x\n", alias_id);
+        return 0;
+    }
+
+    AMD_IOMMU_DEBUG(" Dev_Id Alias: 0x%x\n", alias_id);
+
+    /* override requestor_id and flags for device */
+    add_ivrs_mapping_entry(bdf, alias_id, ivhd_device->header.flags, iommu);
+
+    return dev_length;
+}
+
+static u16 __init parse_ivhd_device_alias_range(
+    union acpi_ivhd_device *ivhd_device,
+    u16 header_length, u16 block_length, struct amd_iommu *iommu)
+{
+
+    u16 dev_length, first_bdf, last_bdf, alias_id, bdf;
+
+    dev_length = sizeof(struct acpi_ivhd_device_alias_range);
+    if ( header_length < (block_length + dev_length) )
+    {
+        AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n");
+        return 0;
+    }
+
+    if ( ivhd_device->alias_range.trailer.type !=
+         AMD_IOMMU_ACPI_IVHD_DEV_RANGE_END )
+    {
+        AMD_IOMMU_DEBUG("IVHD Error: "
+                        "Invalid Range: End_Type 0x%x\n",
+                        ivhd_device->alias_range.trailer.type);
+        return 0;
+    }
+
+    first_bdf = ivhd_device->header.dev_id;
+    if ( first_bdf >= ivrs_bdf_entries )
+    {
+        AMD_IOMMU_DEBUG(
+            "IVHD Error: Invalid Range: First Dev_Id 0x%x\n", first_bdf);
+        return 0;
+    }
+
+    last_bdf = ivhd_device->alias_range.trailer.dev_id;
+    if ( last_bdf >= ivrs_bdf_entries || last_bdf <= first_bdf )
+    {
+        AMD_IOMMU_DEBUG(
+            "IVHD Error: Invalid Range: Last Dev_Id 0x%x\n", last_bdf);
+        return 0;
+    }
+
+    alias_id = ivhd_device->alias_range.alias.dev_id;
+    if ( alias_id >= ivrs_bdf_entries )
+    {
+        AMD_IOMMU_DEBUG("IVHD Error: Invalid Alias Dev_Id 0x%x\n", alias_id);
+        return 0;
+    }
+
+    AMD_IOMMU_DEBUG(" Dev_Id Range: 0x%x -> 0x%x\n", first_bdf, last_bdf);
+    AMD_IOMMU_DEBUG(" Dev_Id Alias: 0x%x\n", alias_id);
+
+    /* override requestor_id and flags for range of devices */
+    for ( bdf = first_bdf; bdf <= last_bdf; bdf++ )
+        add_ivrs_mapping_entry(bdf, alias_id, ivhd_device->header.flags, 
+                               iommu);
+
+    return dev_length;
+}
+
+static u16 __init parse_ivhd_device_extended(
+    union acpi_ivhd_device *ivhd_device,
+    u16 header_length, u16 block_length, struct amd_iommu *iommu)
+{
+    u16 dev_length, bdf;
+
+    dev_length = sizeof(struct acpi_ivhd_device_extended);
+    if ( header_length < (block_length + dev_length) )
+    {
+        AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n");
+        return 0;
+    }
+
+    bdf = ivhd_device->header.dev_id;
+    if ( bdf >= ivrs_bdf_entries )
+    {
+        AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Dev_Id 0x%x\n", bdf);
+        return 0;
+    }
+
+    /* override flags for device */
+    add_ivrs_mapping_entry(bdf, bdf, ivhd_device->header.flags, iommu);
+
+    return dev_length;
+}
+
+static u16 __init parse_ivhd_device_extended_range(
+    union acpi_ivhd_device *ivhd_device,
+    u16 header_length, u16 block_length, struct amd_iommu *iommu)
+{
+    u16 dev_length, first_bdf, last_bdf, bdf;
+
+    dev_length = sizeof(struct acpi_ivhd_device_extended_range);
+    if ( header_length < (block_length + dev_length) )
+    {
+        AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n");
+        return 0;
+    }
+
+    if ( ivhd_device->extended_range.trailer.type !=
+         AMD_IOMMU_ACPI_IVHD_DEV_RANGE_END )
+    {
+        AMD_IOMMU_DEBUG("IVHD Error: "
+                        "Invalid Range: End_Type 0x%x\n",
+                        ivhd_device->extended_range.trailer.type);
+        return 0;
+    }
+
+    first_bdf = ivhd_device->header.dev_id;
+    if ( first_bdf >= ivrs_bdf_entries )
+    {
+        AMD_IOMMU_DEBUG(
+            "IVHD Error: Invalid Range: First Dev_Id 0x%x\n", first_bdf);
+        return 0;
+    }
+
+    last_bdf = ivhd_device->extended_range.trailer.dev_id;
+    if ( (last_bdf >= ivrs_bdf_entries) || (last_bdf <= first_bdf) )
+    {
+        AMD_IOMMU_DEBUG(
+            "IVHD Error: Invalid Range: Last Dev_Id 0x%x\n", last_bdf);
+        return 0;
+    }
+
+    AMD_IOMMU_DEBUG(" Dev_Id Range: 0x%x -> 0x%x\n",
+                    first_bdf, last_bdf);
+
+    /* override flags for range of devices */
+    for ( bdf = first_bdf; bdf <= last_bdf; bdf++ )
+        add_ivrs_mapping_entry(bdf, bdf, ivhd_device->header.flags, iommu);
+
+    return dev_length;
+}
+
+static u16 __init parse_ivhd_device_special(
+    union acpi_ivhd_device *ivhd_device,
+    u16 header_length, u16 block_length, struct amd_iommu *iommu)
+{
+    u16 dev_length, bdf;
+
+    dev_length = sizeof(struct acpi_ivhd_device_special);
+    if ( header_length < (block_length + dev_length) )
+    {
+        AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n");
+        return 0;
+    }
+
+    bdf = ivhd_device->special.dev_id;
+    if ( bdf >= ivrs_bdf_entries )
+    {
+        AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Dev_Id 0x%x\n", bdf);
+        return 0;
+    }
+
+    add_ivrs_mapping_entry(bdf, bdf, ivhd_device->header.flags, iommu);
+    /* set device id of ioapic */
+    ioapic_bdf[ivhd_device->special.handle] = bdf;
+    return dev_length;
+}
+
+static int __init parse_ivhd_block(struct acpi_ivhd_block_header *ivhd_block)
+{
+    union acpi_ivhd_device *ivhd_device;
+    u16 block_length, dev_length;
+    struct amd_iommu *iommu;
+
+    if ( ivhd_block->header.length <
+         sizeof(struct acpi_ivhd_block_header) )
+    {
+        AMD_IOMMU_DEBUG("IVHD Error: Invalid Block Length!\n");
+        return -ENODEV;
+    }
+
+    iommu = find_iommu_from_bdf_cap(ivhd_block->header.dev_id,
+                                    ivhd_block->cap_offset);
+    if ( !iommu )
+    {
+        AMD_IOMMU_DEBUG("IVHD Error: No IOMMU for Dev_Id 0x%x  Cap 0x%x\n",
+                        ivhd_block->header.dev_id, ivhd_block->cap_offset);
+        return -ENODEV;
+    }
+
+    /* parse Device Entries */
+    block_length = sizeof(struct acpi_ivhd_block_header);
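+    /* Walk the device entries following the block header.  Each handler
+     * returns the number of bytes it consumed; a return of 0 marks a
+     * malformed entry and aborts parsing of this IVHD block. */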
+    while ( ivhd_block->header.length >=
+            (block_length + sizeof(struct acpi_ivhd_device_header)) )
+    {
+        ivhd_device = (union acpi_ivhd_device *)
+            ((u8 *)ivhd_block + block_length);
+
+        AMD_IOMMU_DEBUG( "IVHD Device Entry:\n");
+        AMD_IOMMU_DEBUG( " Type 0x%x\n", ivhd_device->header.type);
+        AMD_IOMMU_DEBUG( " Dev_Id 0x%x\n", ivhd_device->header.dev_id);
+        AMD_IOMMU_DEBUG( " Flags 0x%x\n", ivhd_device->header.flags);
+
+        switch ( ivhd_device->header.type )
+        {
+        case AMD_IOMMU_ACPI_IVHD_DEV_U32_PAD:
+            dev_length = parse_ivhd_device_padding(
+                sizeof(u32),
+                ivhd_block->header.length, block_length);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_U64_PAD:
+            dev_length = parse_ivhd_device_padding(
+                sizeof(u64),
+                ivhd_block->header.length, block_length);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_SELECT:
+            dev_length = parse_ivhd_device_select(ivhd_device, iommu);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_RANGE_START:
+            dev_length = parse_ivhd_device_range(
+                ivhd_device,
+                ivhd_block->header.length, block_length, iommu);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_ALIAS_SELECT:
+            dev_length = parse_ivhd_device_alias(
+                ivhd_device,
+                ivhd_block->header.length, block_length, iommu);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_ALIAS_RANGE:
+            dev_length = parse_ivhd_device_alias_range(
+                ivhd_device,
+                ivhd_block->header.length, block_length, iommu);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_EXT_SELECT:
+            dev_length = parse_ivhd_device_extended(
+                ivhd_device,
+                ivhd_block->header.length, block_length, iommu);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_EXT_RANGE:
+            dev_length = parse_ivhd_device_extended_range(
+                ivhd_device,
+                ivhd_block->header.length, block_length, iommu);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_SPECIAL:
+            dev_length = parse_ivhd_device_special(
+                ivhd_device,
+                ivhd_block->header.length, block_length, iommu);
+            break;
+        default:
+            AMD_IOMMU_DEBUG("IVHD Error: Invalid Device Type!\n");
+            dev_length = 0;
+            break;
+        }
+
+        block_length += dev_length;
+        if ( !dev_length )
+            return -ENODEV;
+    }
+
+    return 0;
+}
+
+static int __init parse_ivrs_block(struct acpi_ivrs_block_header *ivrs_block)
+{
+    struct acpi_ivhd_block_header *ivhd_block;
+    struct acpi_ivmd_block_header *ivmd_block;
+
+    switch ( ivrs_block->type )
+    {
+    case AMD_IOMMU_ACPI_IVHD_TYPE:
+        ivhd_block = (struct acpi_ivhd_block_header *)ivrs_block;
+        return parse_ivhd_block(ivhd_block);
+
+    case AMD_IOMMU_ACPI_IVMD_ALL_TYPE:
+    case AMD_IOMMU_ACPI_IVMD_ONE_TYPE:
+    case AMD_IOMMU_ACPI_IVMD_RANGE_TYPE:
+    case AMD_IOMMU_ACPI_IVMD_IOMMU_TYPE:
+        ivmd_block = (struct acpi_ivmd_block_header *)ivrs_block;
+        return parse_ivmd_block(ivmd_block);
+
+    default:
+        AMD_IOMMU_DEBUG("IVRS Error: Invalid Block Type!\n");
+        return -ENODEV;
+    }
+
+    return 0;
+}
+
+static void __init dump_acpi_table_header(struct acpi_table_header *table)
+{
+    int i;
+
+    AMD_IOMMU_DEBUG("ACPI Table:\n");
+    AMD_IOMMU_DEBUG(" Signature ");
+    for ( i = 0; i < ACPI_NAME_SIZE; i++ )
+        printk("%c", table->signature[i]);
+    printk("\n");
+
+    AMD_IOMMU_DEBUG(" Length 0x%x\n", table->length);
+    AMD_IOMMU_DEBUG(" Revision 0x%x\n", table->revision);
+    AMD_IOMMU_DEBUG(" CheckSum 0x%x\n", table->checksum);
+
+    AMD_IOMMU_DEBUG(" OEM_Id ");
+    for ( i = 0; i < ACPI_OEM_ID_SIZE; i++ )
+        printk("%c", table->oem_id[i]);
+    printk("\n");
+
+    AMD_IOMMU_DEBUG(" OEM_Table_Id ");
+    for ( i = 0; i < ACPI_OEM_TABLE_ID_SIZE; i++ )
+        printk("%c", table->oem_table_id[i]);
+    printk("\n");
+
+    AMD_IOMMU_DEBUG(" OEM_Revision 0x%x\n", table->oem_revision);
+
+    AMD_IOMMU_DEBUG(" Creator_Id ");
+    for ( i = 0; i < ACPI_NAME_SIZE; i++ )
+        printk("%c", table->asl_compiler_id[i]);
+    printk("\n");
+
+    AMD_IOMMU_DEBUG(" Creator_Revision 0x%x\n",
+                    table->asl_compiler_revision);
+}
+
+static int __init parse_ivrs_table(unsigned long phys_addr,
+                                    unsigned long size)
+{
+    struct acpi_ivrs_block_header *ivrs_block;
+    unsigned long length;
+    int error = 0;
+    struct acpi_table_header *table;
+
+    table = (struct acpi_table_header *)__acpi_map_table(phys_addr, size);
+    if ( !table )
+    {
+        AMD_IOMMU_DEBUG("IVRS Error: Unable to map IVRS\n");
+        return -ENODEV;
+    }
+
+    if ( amd_iommu_debug )
+        dump_acpi_table_header(table);
+
+    /* parse IVRS blocks */
+    length = sizeof(struct acpi_ivrs_table_header);
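+    /* Iterate over the IVRS sub-blocks, bailing out if a block claims to
+     * extend past the end of the table. */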
+    while ( (error == 0) && (table->length > (length + sizeof(*ivrs_block))) )
+    {
+        ivrs_block = (struct acpi_ivrs_block_header *)
+            ((u8 *)table + length);
+
+        AMD_IOMMU_DEBUG("IVRS Block:\n");
+        AMD_IOMMU_DEBUG(" Type 0x%x\n", ivrs_block->type);
+        AMD_IOMMU_DEBUG(" Flags 0x%x\n", ivrs_block->flags);
+        AMD_IOMMU_DEBUG(" Length 0x%x\n", ivrs_block->length);
+        AMD_IOMMU_DEBUG(" Dev_Id 0x%x\n", ivrs_block->dev_id);
+
+        if ( table->length < (length + ivrs_block->length) )
+        {
+            AMD_IOMMU_DEBUG("IVRS Error: "
+                            "Table Length Exceeded: 0x%x -> 0x%lx\n",
+                            table->length,
+                            (length + ivrs_block->length));
+            return -ENODEV;
+        }
+
+        error = parse_ivrs_block(ivrs_block);
+        length += ivrs_block->length;
+    }
+
+    /* this will be used in amd_iommu_update_ivrs_mapping_acpi() */
+    parse_ivrs_table_error = error;
+    return error;
+}
+
+static int __init detect_iommu_acpi(unsigned long phys_addr,
+                                    unsigned long size)
+{
+    struct acpi_ivrs_block_header *ivrs_block;
+    struct acpi_table_header *table;
+    unsigned long i;
+    unsigned long length = sizeof(struct acpi_ivrs_table_header);
+    u8 checksum, *raw_table;
+
+    table = (struct acpi_table_header *)__acpi_map_table(phys_addr, size);
+    if ( !table )
+    {
+        AMD_IOMMU_DEBUG("IVRS Error: Unable to map IVRS\n");
+        return -ENODEV;
+    }
+
+    /* validate checksum: sum of entire table == 0 */
+    checksum = 0;
+    raw_table = (u8 *)table;
+    for ( i = 0; i < table->length; i++ )
+        checksum += raw_table[i];
+    if ( checksum )
+    {
+        AMD_IOMMU_DEBUG("IVRS Error: "
+                        "Invalid Checksum 0x%x\n", checksum);
+        return -ENODEV;
+    }
+
+    while ( table->length > (length + sizeof(*ivrs_block)) )
+    {
+        ivrs_block = (struct acpi_ivrs_block_header *) ((u8 *)table + length);
+        if ( table->length < (length + ivrs_block->length) )
+            return -ENODEV;
+        if ( ivrs_block->type == AMD_IOMMU_ACPI_IVHD_TYPE )
+            if ( amd_iommu_detect_one_acpi((void*)ivrs_block) != 0 )
+                return -ENODEV;
+        length += ivrs_block->length;
+    }
+    return 0;
+}
+
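+/* Track the highest device id (BDF) referenced by any IVHD entry; the
+ * result is used to size the ivrs_mappings table (see
+ * amd_iommu_get_ivrs_dev_entries). */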
+#define UPDATE_LAST_BDF(x) do { \
+    if ( (x) > last_bdf ) \
+        last_bdf = (x); \
+} while ( 0 )
+
+static int __init get_last_bdf_ivhd(void *ivhd)
+{
+    union acpi_ivhd_device *ivhd_device;
+    u16 block_length, dev_length;
+    struct acpi_ivhd_block_header *ivhd_block;
+
+    ivhd_block = (struct acpi_ivhd_block_header *)ivhd;
+
+    if ( ivhd_block->header.length <
+         sizeof(struct acpi_ivhd_block_header) )
+    {
+        AMD_IOMMU_DEBUG("IVHD Error: Invalid Block Length!\n");
+        return -ENODEV;
+    }
+
+    block_length = sizeof(struct acpi_ivhd_block_header);
+    while ( ivhd_block->header.length >=
+            (block_length + sizeof(struct acpi_ivhd_device_header)) )
+    {
+        ivhd_device = (union acpi_ivhd_device *)
+            ((u8 *)ivhd_block + block_length);
+
+        switch ( ivhd_device->header.type )
+        {
+        case AMD_IOMMU_ACPI_IVHD_DEV_U32_PAD:
+            dev_length = sizeof(u32);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_U64_PAD:
+            dev_length = sizeof(u64);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_SELECT:
+            UPDATE_LAST_BDF(ivhd_device->header.dev_id);
+            dev_length = sizeof(struct acpi_ivhd_device_header);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_ALIAS_SELECT:
+            UPDATE_LAST_BDF(ivhd_device->header.dev_id);
+            dev_length = sizeof(struct acpi_ivhd_device_alias);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_EXT_SELECT:
+            UPDATE_LAST_BDF(ivhd_device->header.dev_id);
+            dev_length = sizeof(struct acpi_ivhd_device_extended);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_RANGE_START:
+            UPDATE_LAST_BDF(ivhd_device->range.trailer.dev_id);
+            dev_length = sizeof(struct acpi_ivhd_device_range);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_ALIAS_RANGE:
+            UPDATE_LAST_BDF(ivhd_device->alias_range.trailer.dev_id);
+            dev_length = sizeof(struct acpi_ivhd_device_alias_range);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_EXT_RANGE:
+            UPDATE_LAST_BDF(ivhd_device->extended_range.trailer.dev_id);
+            dev_length = sizeof(struct acpi_ivhd_device_extended_range);
+            break;
+        case AMD_IOMMU_ACPI_IVHD_DEV_SPECIAL:
+            UPDATE_LAST_BDF(ivhd_device->special.dev_id);
+            dev_length = sizeof(struct acpi_ivhd_device_special);
+            break;
+        default:
+            AMD_IOMMU_DEBUG("IVHD Error: Invalid Device Type!\n");
+            dev_length = 0;
+            break;
+        }
+
+        block_length += dev_length;
+        if ( !dev_length )
+            return -ENODEV;
+    }
+
+    return 0;
+}
+
+
+static int __init get_last_bdf_acpi(unsigned long phys_addr,
+                                    unsigned long size)
+{
+    struct acpi_ivrs_block_header *ivrs_block;
+    struct acpi_table_header *table;
+    unsigned long length = sizeof(struct acpi_ivrs_table_header);
+
+    table = (struct acpi_table_header *)__acpi_map_table(phys_addr, size);
+    if ( !table )
+    {
+        AMD_IOMMU_DEBUG("IVRS Error: Unable to map IVRS\n");
+        return -ENODEV;
+    }
+
+    while ( table->length > (length + sizeof(*ivrs_block)) )
+    {
+        ivrs_block = (struct acpi_ivrs_block_header *) ((u8 *)table + length);
+        if ( table->length < (length + ivrs_block->length) )
+            return -ENODEV;
+        if ( ivrs_block->type == AMD_IOMMU_ACPI_IVHD_TYPE )
+            if ( get_last_bdf_ivhd((void*)ivrs_block) != 0 )
+                return -ENODEV;
+        length += ivrs_block->length;
+    }
+    return 0;
+}
+
+int __init amd_iommu_detect_acpi(void)
+{
+    return acpi_table_parse(ACPI_IVRS, detect_iommu_acpi);
+}
+
+int __init amd_iommu_get_ivrs_dev_entries(void)
+{
+    acpi_table_parse(ACPI_IVRS, get_last_bdf_acpi);
+    return last_bdf + 1;
+}
+
+int __init amd_iommu_update_ivrs_mapping_acpi(void)
+{
+    /* acpi_table_parse() does not propagate the return value of
+     * parse_ivrs_table(), so the result is fetched from the global
+     * parse_ivrs_table_error instead.
+     */
+    acpi_table_parse(ACPI_IVRS, parse_ivrs_table);
+
+    return parse_ivrs_table_error;
+}
diff -Naurp xen/drivers/passthrough/amd/iommu_detect.c xen-redhat/drivers/passthrough/amd/iommu_detect.c
--- xen/drivers/passthrough/amd/iommu_detect.c
+++ xen-redhat/drivers/passthrough/amd/iommu_detect.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright (C) 2007 Advanced Micro Devices, Inc.
+ * Author: Leo Duran <leo.duran@amd.com>
+ * Author: Wei Wang <wei.wang2@amd.com> - adapted to xen
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <xen/config.h>
+#include <xen/errno.h>
+#include <xen/iommu.h>
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
+#include <asm/amd-iommu.h>
+#include <asm/hvm/svm/amd-iommu-proto.h>
+#include <asm/hvm/svm/amd-iommu-acpi.h>
+
+extern struct list_head amd_iommu_head;
+unsigned short last_bdf = 0;
+
+static int __init get_iommu_msi_capabilities(u8 bus, u8 dev, u8 func,
+            struct amd_iommu *iommu)
+{
+    int cap_ptr, cap_id;
+    u32 cap_header;
+    u16 control;
+    int count = 0;
+
+    cap_ptr = pci_conf_read8(bus, dev, func,
+            PCI_CAPABILITY_LIST);
+
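+    /* Walk the PCI capability list looking for the MSI capability,
+     * bounding the walk with PCI_MAX_CAP_BLOCKS to avoid spinning on a
+     * malformed (circular) capability chain. */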
+    while ( cap_ptr >= PCI_MIN_CAP_OFFSET &&
+        count < PCI_MAX_CAP_BLOCKS )
+    {
+        cap_ptr &= PCI_CAP_PTR_MASK;
+        cap_header = pci_conf_read32(bus, dev, func, cap_ptr);
+        cap_id = get_field_from_reg_u32(cap_header,
+                PCI_CAP_ID_MASK, PCI_CAP_ID_SHIFT);
+
+        if ( cap_id == PCI_CAP_ID_MSI )
+        {
+            iommu->msi_cap = cap_ptr;
+            break;
+        }
+        cap_ptr = get_field_from_reg_u32(cap_header,
+                PCI_CAP_NEXT_PTR_MASK, PCI_CAP_NEXT_PTR_SHIFT);
+        count++;
+    }
+
+    if ( !iommu->msi_cap )
+        return -ENODEV;
+
+    AMD_IOMMU_DEBUG("Found MSI capability block \n");
+    control = pci_conf_read16(bus, dev, func,
+            iommu->msi_cap + PCI_MSI_FLAGS);
+    iommu->maskbit = control & PCI_MSI_FLAGS_MASKBIT;
+    return 0;
+}
+
+int __init get_iommu_capabilities(u8 bus, u8 dev, u8 func, u8 cap_ptr,
+                                  struct amd_iommu *iommu)
+{
+    u32 cap_header, cap_range, misc_info;
+
+    cap_header = pci_conf_read32(bus, dev, func, cap_ptr);
+    iommu->revision = get_field_from_reg_u32(
+        cap_header, PCI_CAP_REV_MASK, PCI_CAP_REV_SHIFT);
+    iommu->pte_not_present_cached = get_field_from_reg_u32(
+        cap_header, PCI_CAP_NP_CACHE_MASK, PCI_CAP_NP_CACHE_SHIFT);
+
+    cap_range = pci_conf_read32(bus, dev, func,
+                                cap_ptr + PCI_CAP_RANGE_OFFSET);
+    iommu->unit_id = get_field_from_reg_u32(
+        cap_range, PCI_CAP_UNIT_ID_MASK, PCI_CAP_UNIT_ID_SHIFT);
+
+    misc_info = pci_conf_read32(bus, dev, func,
+                                cap_ptr + PCI_MISC_INFO_OFFSET);
+    iommu->msi_number = get_field_from_reg_u32(
+        misc_info, PCI_CAP_MSI_NUMBER_MASK, PCI_CAP_MSI_NUMBER_SHIFT);
+
+    return 0;
+}
+
+int __init amd_iommu_detect_one_acpi(void *ivhd)
+{
+    struct amd_iommu *iommu;
+    u8 bus, dev, func;
+    struct acpi_ivhd_block_header *ivhd_block;
+
+    ivhd_block = (struct acpi_ivhd_block_header *)ivhd;
+
+    if ( ivhd_block->header.length < sizeof(struct acpi_ivhd_block_header) )
+    {
+        AMD_IOMMU_DEBUG("Invalid IVHD Block Length!\n");
+        return -ENODEV;
+    }
+
+    if ( !ivhd_block->header.dev_id ||
+        !ivhd_block->cap_offset || !ivhd_block->mmio_base)
+    {
+        AMD_IOMMU_DEBUG("Invalid IVHD Block!\n");
+        return -ENODEV;
+    }
+
+    iommu = (struct amd_iommu *) xmalloc(struct amd_iommu);
+    if ( !iommu )
+    {
+        AMD_IOMMU_DEBUG("Error allocating amd_iommu\n");
+        return -ENOMEM;
+    }
+    memset(iommu, 0, sizeof(struct amd_iommu));
+
+    spin_lock_init(&iommu->lock);
+
+    iommu->bdf = ivhd_block->header.dev_id;
+    iommu->cap_offset = ivhd_block->cap_offset;
+    iommu->mmio_base_phys = ivhd_block->mmio_base;
+
+    /* override IOMMU support flags */
+    iommu->coherent = get_field_from_byte(ivhd_block->header.flags,
+                        AMD_IOMMU_ACPI_COHERENT_MASK,
+                        AMD_IOMMU_ACPI_COHERENT_SHIFT);
+    iommu->iotlb_support = get_field_from_byte(ivhd_block->header.flags,
+                        AMD_IOMMU_ACPI_IOTLB_SUP_MASK,
+                        AMD_IOMMU_ACPI_IOTLB_SUP_SHIFT);
+    iommu->isochronous = get_field_from_byte(ivhd_block->header.flags,
+                        AMD_IOMMU_ACPI_ISOC_MASK,
+                        AMD_IOMMU_ACPI_ISOC_SHIFT);
+    iommu->res_pass_pw = get_field_from_byte(ivhd_block->header.flags,
+                        AMD_IOMMU_ACPI_RES_PASS_PW_MASK,
+                        AMD_IOMMU_ACPI_RES_PASS_PW_SHIFT);
+    iommu->pass_pw = get_field_from_byte(ivhd_block->header.flags,
+                        AMD_IOMMU_ACPI_PASS_PW_MASK,
+                        AMD_IOMMU_ACPI_PASS_PW_SHIFT);
+    iommu->ht_tunnel_enable = get_field_from_byte(ivhd_block->header.flags,
+                        AMD_IOMMU_ACPI_HT_TUN_ENB_MASK,
+                        AMD_IOMMU_ACPI_HT_TUN_ENB_SHIFT);
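+    /* The IVHD dev_id is a BDF: bus number in the high byte, device and
+     * function in the low byte. */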
+    bus = iommu->bdf >> 8;
+    dev = PCI_SLOT(iommu->bdf & 0xFF);
+    func = PCI_FUNC(iommu->bdf & 0xFF);
+    get_iommu_capabilities(bus, dev, func, iommu->cap_offset, iommu);
+    get_iommu_msi_capabilities(bus, dev, func, iommu);
+
+    list_add_tail(&iommu->list, &amd_iommu_head);
+
+    return 0;
+}
diff -Naurp xen/drivers/passthrough/amd/iommu_init.c xen-redhat/drivers/passthrough/amd/iommu_init.c
--- xen/drivers/passthrough/amd/iommu_init.c
+++ xen-redhat/drivers/passthrough/amd/iommu_init.c
@@ -0,0 +1,839 @@
+/*
+ * Copyright (C) 2007 Advanced Micro Devices, Inc.
+ * Author: Leo Duran <leo.duran@amd.com>
+ * Author: Wei Wang <wei.wang2@amd.com> - adapted to xen
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <xen/config.h>
+#include <xen/errno.h>
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
+#include <asm/amd-iommu.h>
+#include <asm/hvm/svm/amd-iommu-proto.h>
+#include <asm-x86/fixmap.h>
+
+static struct amd_iommu *vector_to_iommu[NR_VECTORS];
+static int nr_amd_iommus;
+static long amd_iommu_cmd_buffer_entries = IOMMU_CMD_BUFFER_DEFAULT_ENTRIES;
+static long amd_iommu_event_log_entries = IOMMU_EVENT_LOG_DEFAULT_ENTRIES;
+
+unsigned short ivrs_bdf_entries;
+struct ivrs_mappings *ivrs_mappings;
+struct list_head amd_iommu_head;
+struct table_struct device_table;
+
+/*
+ * Shifts for MSI data
+ */
+
+#define MSI_DATA_VECTOR_SHIFT		0
+#define  MSI_DATA_VECTOR_MASK		0x000000ff
+#define	 MSI_DATA_VECTOR(v)		(((v) << MSI_DATA_VECTOR_SHIFT) & MSI_DATA_VECTOR_MASK)
+
+#define MSI_DATA_DELIVERY_MODE_SHIFT	8
+#define  MSI_DATA_DELIVERY_FIXED	(0 << MSI_DATA_DELIVERY_MODE_SHIFT)
+#define  MSI_DATA_DELIVERY_LOWPRI	(1 << MSI_DATA_DELIVERY_MODE_SHIFT)
+
+#define MSI_DATA_LEVEL_SHIFT		14
+#define	 MSI_DATA_LEVEL_DEASSERT	(0 << MSI_DATA_LEVEL_SHIFT)
+#define	 MSI_DATA_LEVEL_ASSERT		(1 << MSI_DATA_LEVEL_SHIFT)
+
+#define MSI_DATA_TRIGGER_SHIFT		15
+#define  MSI_DATA_TRIGGER_EDGE		(0 << MSI_DATA_TRIGGER_SHIFT)
+#define  MSI_DATA_TRIGGER_LEVEL		(1 << MSI_DATA_TRIGGER_SHIFT)
+
+/*
+ * Shift/mask fields for msi address
+ */
+
+#define MSI_ADDR_BASE_HI	    	0
+#define MSI_ADDR_BASE_LO	    	0xfee00000
+#define MSI_ADDR_HEADER             MSI_ADDR_BASE_LO
+
+#define MSI_ADDR_DESTMODE_SHIFT     2
+#define MSI_ADDR_DESTMODE_PHYS      (0 << MSI_ADDR_DESTMODE_SHIFT)
+#define MSI_ADDR_DESTMODE_LOGIC     (1 << MSI_ADDR_DESTMODE_SHIFT)
+
+#define MSI_ADDR_REDIRECTION_SHIFT  3
+#define MSI_ADDR_REDIRECTION_CPU    (0 << MSI_ADDR_REDIRECTION_SHIFT)
+#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT)
+
+#define MSI_ADDR_DEST_ID_SHIFT		12
+#define	 MSI_ADDR_DEST_ID_MASK		0x00ffff0
+#define  MSI_ADDR_DEST_ID(dest)		(((dest) << MSI_ADDR_DEST_ID_SHIFT) & MSI_ADDR_DEST_ID_MASK)
+
+static int __init map_iommu_mmio_region(struct amd_iommu *iommu)
+{
+    unsigned long mfn;
+
+    if ( nr_amd_iommus > MAX_AMD_IOMMUS )
+    {
+        AMD_IOMMU_DEBUG("nr_amd_iommus %d > MAX_IOMMUS\n", nr_amd_iommus);
+        return -ENOMEM;
+    }
+
+    iommu->mmio_base = (void *)fix_to_virt(
+        FIX_IOMMU_MMIO_BASE_0 + nr_amd_iommus * MMIO_PAGES_PER_IOMMU);
+    mfn = (unsigned long)(iommu->mmio_base_phys >> PAGE_SHIFT);
+    map_pages_to_xen((unsigned long)iommu->mmio_base, mfn,
+                     MMIO_PAGES_PER_IOMMU, PAGE_HYPERVISOR_NOCACHE);
+
+    memset(iommu->mmio_base, 0, IOMMU_MMIO_REGION_LENGTH);
+
+    return 0;
+}
+
+static void __init unmap_iommu_mmio_region(struct amd_iommu *iommu)
+{
+    if ( iommu->mmio_base )
+    {
+        iounmap(iommu->mmio_base);
+        iommu->mmio_base = NULL;
+    }
+}
+
+static void __init register_iommu_dev_table_in_mmio_space(struct amd_iommu *iommu)
+{
+    u64 addr_64, addr_lo, addr_hi;
+    u32 entry;
+
+    addr_64 = (u64)virt_to_maddr(iommu->dev_table.buffer);
+    addr_lo = addr_64 & DMA_32BIT_MASK;
+    addr_hi = addr_64 >> 32;
+
+    set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0,
+                         IOMMU_DEV_TABLE_BASE_LOW_MASK,
+                         IOMMU_DEV_TABLE_BASE_LOW_SHIFT, &entry);
+    set_field_in_reg_u32((iommu->dev_table.alloc_size / PAGE_SIZE) - 1,
+                         entry, IOMMU_DEV_TABLE_SIZE_MASK,
+                         IOMMU_DEV_TABLE_SIZE_SHIFT, &entry);
+    writel(entry, iommu->mmio_base + IOMMU_DEV_TABLE_BASE_LOW_OFFSET);
+
+    set_field_in_reg_u32((u32)addr_hi, 0,
+                         IOMMU_DEV_TABLE_BASE_HIGH_MASK,
+                         IOMMU_DEV_TABLE_BASE_HIGH_SHIFT, &entry);
+    writel(entry, iommu->mmio_base + IOMMU_DEV_TABLE_BASE_HIGH_OFFSET);
+}
+
+static void __init register_iommu_cmd_buffer_in_mmio_space(struct amd_iommu *iommu)
+{
+    u64 addr_64, addr_lo, addr_hi;
+    u32 power_of2_entries;
+    u32 entry;
+
+    addr_64 = (u64)virt_to_maddr(iommu->cmd_buffer.buffer);
+    addr_lo = addr_64 & DMA_32BIT_MASK;
+    addr_hi = addr_64 >> 32;
+
+    set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0,
+                         IOMMU_CMD_BUFFER_BASE_LOW_MASK,
+                         IOMMU_CMD_BUFFER_BASE_LOW_SHIFT, &entry);
+    writel(entry, iommu->mmio_base + IOMMU_CMD_BUFFER_BASE_LOW_OFFSET);
+
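+    /* The command buffer length field is programmed as log2 of the entry
+     * count: the allocation order plus the log2 entries-per-page constant. */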
+    power_of2_entries = get_order_from_bytes(iommu->cmd_buffer.alloc_size) +
+        IOMMU_CMD_BUFFER_POWER_OF2_ENTRIES_PER_PAGE;
+
+    set_field_in_reg_u32((u32)addr_hi, 0,
+                         IOMMU_CMD_BUFFER_BASE_HIGH_MASK,
+                         IOMMU_CMD_BUFFER_BASE_HIGH_SHIFT, &entry);
+    set_field_in_reg_u32(power_of2_entries, entry,
+                         IOMMU_CMD_BUFFER_LENGTH_MASK,
+                         IOMMU_CMD_BUFFER_LENGTH_SHIFT, &entry);
+    writel(entry, iommu->mmio_base+IOMMU_CMD_BUFFER_BASE_HIGH_OFFSET);
+}
+
+static void __init register_iommu_event_log_in_mmio_space(struct amd_iommu *iommu)
+{
+    u64 addr_64, addr_lo, addr_hi;
+    u32 power_of2_entries;
+    u32 entry;
+
+    addr_64 = (u64)virt_to_maddr(iommu->event_log.buffer);
+    addr_lo = addr_64 & DMA_32BIT_MASK;
+    addr_hi = addr_64 >> 32;
+
+    set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0,
+                         IOMMU_EVENT_LOG_BASE_LOW_MASK,
+                         IOMMU_EVENT_LOG_BASE_LOW_SHIFT, &entry);
+    writel(entry, iommu->mmio_base + IOMMU_EVENT_LOG_BASE_LOW_OFFSET);
+
+    power_of2_entries = get_order_from_bytes(iommu->event_log.alloc_size) +
+                        IOMMU_EVENT_LOG_POWER_OF2_ENTRIES_PER_PAGE;
+
+    set_field_in_reg_u32((u32)addr_hi, 0,
+                        IOMMU_EVENT_LOG_BASE_HIGH_MASK,
+                        IOMMU_EVENT_LOG_BASE_HIGH_SHIFT, &entry);
+    set_field_in_reg_u32(power_of2_entries, entry,
+                        IOMMU_EVENT_LOG_LENGTH_MASK,
+                        IOMMU_EVENT_LOG_LENGTH_SHIFT, &entry);
+    writel(entry, iommu->mmio_base+IOMMU_EVENT_LOG_BASE_HIGH_OFFSET);
+}
+
+static void __init set_iommu_translation_control(struct amd_iommu *iommu,
+                                                 int enable)
+{
+    u32 entry;
+
+    entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
+
+    if ( enable )
+    {
+        set_field_in_reg_u32(iommu->ht_tunnel_support ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_CONTROL_HT_TUNNEL_TRANSLATION_MASK,
+                         IOMMU_CONTROL_HT_TUNNEL_TRANSLATION_SHIFT, &entry);
+        set_field_in_reg_u32(iommu->isochronous ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_CONTROL_ISOCHRONOUS_MASK,
+                         IOMMU_CONTROL_ISOCHRONOUS_SHIFT, &entry);
+        set_field_in_reg_u32(iommu->coherent ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_CONTROL_COHERENT_MASK,
+                         IOMMU_CONTROL_COHERENT_SHIFT, &entry);
+        set_field_in_reg_u32(iommu->res_pass_pw ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_CONTROL_RESP_PASS_POSTED_WRITE_MASK,
+                         IOMMU_CONTROL_RESP_PASS_POSTED_WRITE_SHIFT, &entry);
+        /* do not set PassPW bit */
+        set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_CONTROL_PASS_POSTED_WRITE_MASK,
+                         IOMMU_CONTROL_PASS_POSTED_WRITE_SHIFT, &entry);
+    }
+    set_field_in_reg_u32(enable ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_CONTROL_TRANSLATION_ENABLE_MASK,
+                         IOMMU_CONTROL_TRANSLATION_ENABLE_SHIFT, &entry);
+    writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET);
+}
+
+static void __init set_iommu_command_buffer_control(struct amd_iommu *iommu,
+                                                    int enable)
+{
+    u32 entry;
+
+    entry = readl(iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET);
+    set_field_in_reg_u32(enable ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_MASK,
+                         IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_SHIFT, &entry);
+    writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET);
+
+    /* reset head and tail pointers */
+    writel(0x0, iommu->mmio_base + IOMMU_CMD_BUFFER_HEAD_OFFSET);
+    writel(0x0, iommu->mmio_base + IOMMU_CMD_BUFFER_TAIL_OFFSET);
+}
+
+static void __init register_iommu_exclusion_range(struct amd_iommu *iommu)
+{
+    u64 addr_lo, addr_hi;
+    u32 entry;
+
+    addr_lo = iommu->exclusion_limit & DMA_32BIT_MASK;
+    addr_hi = iommu->exclusion_limit >> 32;
+
+    set_field_in_reg_u32((u32)addr_hi, 0,
+                         IOMMU_EXCLUSION_LIMIT_HIGH_MASK,
+                         IOMMU_EXCLUSION_LIMIT_HIGH_SHIFT, &entry);
+    writel(entry, iommu->mmio_base+IOMMU_EXCLUSION_LIMIT_HIGH_OFFSET);
+
+    set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0,
+                         IOMMU_EXCLUSION_LIMIT_LOW_MASK,
+                         IOMMU_EXCLUSION_LIMIT_LOW_SHIFT, &entry);
+    writel(entry, iommu->mmio_base+IOMMU_EXCLUSION_LIMIT_LOW_OFFSET);
+
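+    /* The base-low register also holds the allow-all and range-enable
+     * bits, so it is written last, after the limit and base-high
+     * registers have been set up. */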
+    addr_lo = iommu->exclusion_base & DMA_32BIT_MASK;
+    addr_hi = iommu->exclusion_base >> 32;
+
+    set_field_in_reg_u32((u32)addr_hi, 0,
+                         IOMMU_EXCLUSION_BASE_HIGH_MASK,
+                         IOMMU_EXCLUSION_BASE_HIGH_SHIFT, &entry);
+    writel(entry, iommu->mmio_base+IOMMU_EXCLUSION_BASE_HIGH_OFFSET);
+
+    set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0,
+                         IOMMU_EXCLUSION_BASE_LOW_MASK,
+                         IOMMU_EXCLUSION_BASE_LOW_SHIFT, &entry);
+
+    set_field_in_reg_u32(iommu->exclusion_allow_all, entry,
+                         IOMMU_EXCLUSION_ALLOW_ALL_MASK,
+                         IOMMU_EXCLUSION_ALLOW_ALL_SHIFT, &entry);
+
+    set_field_in_reg_u32(iommu->exclusion_enable, entry,
+                         IOMMU_EXCLUSION_RANGE_ENABLE_MASK,
+                         IOMMU_EXCLUSION_RANGE_ENABLE_SHIFT, &entry);
+    writel(entry, iommu->mmio_base+IOMMU_EXCLUSION_BASE_LOW_OFFSET);
+}
+
+static void __init set_iommu_event_log_control(struct amd_iommu *iommu,
+            int enable)
+{
+    u32 entry;
+
+    entry = readl(iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET);
+    set_field_in_reg_u32(enable ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_CONTROL_EVENT_LOG_ENABLE_MASK,
+                         IOMMU_CONTROL_EVENT_LOG_ENABLE_SHIFT, &entry);
+    writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET);
+
+    set_field_in_reg_u32(enable ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_CONTROL_EVENT_LOG_INT_MASK,
+                         IOMMU_CONTROL_EVENT_LOG_INT_SHIFT, &entry);
+    writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET);
+
+    set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_CONTROL_COMP_WAIT_INT_MASK,
+                         IOMMU_CONTROL_COMP_WAIT_INT_SHIFT, &entry);
+    writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET);
+
+    /* reset head and tail pointers */
+    writel(0x0, iommu->mmio_base + IOMMU_EVENT_LOG_HEAD_OFFSET);
+    writel(0x0, iommu->mmio_base + IOMMU_EVENT_LOG_TAIL_OFFSET);
+}
+
+static int amd_iommu_read_event_log(struct amd_iommu *iommu, u32 event[])
+{
+    u32 tail, head, *event_log;
+    int i;
+
+    BUG_ON( !iommu || !event );
+
+    /* make sure there's an entry in the log */
+    tail = get_field_from_reg_u32(
+                readl(iommu->mmio_base + IOMMU_EVENT_LOG_TAIL_OFFSET),
+                IOMMU_EVENT_LOG_TAIL_MASK,
+                IOMMU_EVENT_LOG_TAIL_SHIFT);
+    if ( tail != iommu->event_log_head )
+    {
+        /* read event log entry */
+        event_log = (u32 *)(iommu->event_log.buffer +
+                                        (iommu->event_log_head *
+                                        IOMMU_EVENT_LOG_ENTRY_SIZE));
+        for ( i = 0; i < IOMMU_EVENT_LOG_U32_PER_ENTRY; i++ )
+            event[i] = event_log[i];
+        if ( ++iommu->event_log_head == iommu->event_log.entries )
+            iommu->event_log_head = 0;
+
+        /* update head pointer */
+        set_field_in_reg_u32(iommu->event_log_head, 0,
+                             IOMMU_EVENT_LOG_HEAD_MASK,
+                             IOMMU_EVENT_LOG_HEAD_SHIFT, &head);
+        writel(head, iommu->mmio_base + IOMMU_EVENT_LOG_HEAD_OFFSET);
+        return 0;
+    }
+
+    return -EFAULT;
+}
+
+static void amd_iommu_msi_data_init(struct amd_iommu *iommu)
+{
+    u32 msi_data;
+    u8 bus = (iommu->bdf >> 8) & 0xff;
+    u8 dev = PCI_SLOT(iommu->bdf & 0xff);
+    u8 func = PCI_FUNC(iommu->bdf & 0xff);
+    int vector = iommu->vector;
+
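+    /* Compose the MSI data word: edge triggered, fixed delivery mode,
+     * carrying the vector assigned to this IOMMU. */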
+    msi_data = MSI_DATA_TRIGGER_EDGE |
+        MSI_DATA_LEVEL_ASSERT |
+        MSI_DATA_DELIVERY_FIXED |
+        MSI_DATA_VECTOR(vector);
+
+    pci_conf_write32(bus, dev, func,
+        iommu->msi_cap + PCI_MSI_DATA_64, msi_data);
+}
+
+static void amd_iommu_msi_addr_init(struct amd_iommu *iommu, int phy_cpu)
+{
+
+    int bus = (iommu->bdf >> 8) & 0xff;
+    int dev = PCI_SLOT(iommu->bdf & 0xff);
+    int func = PCI_FUNC(iommu->bdf & 0xff);
+
+    u32 address_hi = 0;
+    u32 address_lo = MSI_ADDR_HEADER |
+            MSI_ADDR_DESTMODE_PHYS |
+            MSI_ADDR_REDIRECTION_CPU |
+            MSI_ADDR_DEST_ID(phy_cpu);
+
+    pci_conf_write32(bus, dev, func,
+        iommu->msi_cap + PCI_MSI_ADDRESS_LO, address_lo);
+    pci_conf_write32(bus, dev, func,
+        iommu->msi_cap + PCI_MSI_ADDRESS_HI, address_hi);
+}
+
+static void amd_iommu_msi_enable(struct amd_iommu *iommu, int flag)
+{
+    u16 control;
+    int bus = (iommu->bdf >> 8) & 0xff;
+    int dev = PCI_SLOT(iommu->bdf & 0xff);
+    int func = PCI_FUNC(iommu->bdf & 0xff);
+
+    control = pci_conf_read16(bus, dev, func,
+        iommu->msi_cap + PCI_MSI_FLAGS);
+    control &= ~(1);
+    if ( flag )
+        control |= flag;
+    pci_conf_write16(bus, dev, func,
+        iommu->msi_cap + PCI_MSI_FLAGS, control);
+}
+
+static void iommu_msi_unmask(unsigned int vector)
+{
+    unsigned long flags;
+    struct amd_iommu *iommu = vector_to_iommu[vector];
+
+    /* FIXME: do not support mask bits at the moment */
+    if ( iommu->maskbit )
+        return;
+
+    spin_lock_irqsave(&iommu->lock, flags);
+    amd_iommu_msi_enable(iommu, IOMMU_CONTROL_ENABLED);
+    spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static void iommu_msi_mask(unsigned int vector)
+{
+    unsigned long flags;
+    struct amd_iommu *iommu = vector_to_iommu[vector];
+
+    /* FIXME: do not support mask bits at the moment */
+    if ( iommu->maskbit )
+        return;
+
+    spin_lock_irqsave(&iommu->lock, flags);
+    amd_iommu_msi_enable(iommu, IOMMU_CONTROL_DISABLED);
+    spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static unsigned int iommu_msi_startup(unsigned int vector)
+{
+    iommu_msi_unmask(vector);
+    return 0;
+}
+
+static void iommu_msi_end(unsigned int vector)
+{
+    iommu_msi_unmask(vector);
+    ack_APIC_irq();
+}
+
+static void iommu_msi_set_affinity(unsigned int vector, cpumask_t dest)
+{
+    struct amd_iommu *iommu = vector_to_iommu[vector];
+    amd_iommu_msi_addr_init(iommu, cpu_physical_id(first_cpu(dest)));
+}
+
+static struct hw_interrupt_type iommu_msi_type = {
+    .typename = "AMD_IOV_MSI",
+    .startup = iommu_msi_startup,
+    .shutdown = iommu_msi_mask,
+    .enable = iommu_msi_unmask,
+    .disable = iommu_msi_mask,
+    .ack = iommu_msi_mask,
+    .end = iommu_msi_end,
+    .set_affinity = iommu_msi_set_affinity,
+};
+
+static void parse_event_log_entry(u32 entry[])
+{
+    u16 domain_id, device_id, bdf, cword;
+    u32 code;
+    u64 *addr;
+    char * event_str[] = {"ILLEGAL_DEV_TABLE_ENTRY",
+                          "IO_PAGE_FALT",
+                          "DEV_TABLE_HW_ERROR",
+                          "PAGE_TABLE_HW_ERROR",
+                          "ILLEGAL_COMMAND_ERROR",
+                          "COMMAND_HW_ERROR",
+                          "IOTLB_INV_TIMEOUT",
+                          "INVALID_DEV_REQUEST"};
+
+    code = get_field_from_reg_u32(entry[1], IOMMU_EVENT_CODE_MASK,
+                                            IOMMU_EVENT_CODE_SHIFT);
+
+    if ( (code > IOMMU_EVENT_INVALID_DEV_REQUEST) ||
+        (code < IOMMU_EVENT_ILLEGAL_DEV_TABLE_ENTRY) )
+    {
+        AMD_IOMMU_DEBUG("Invalid event log entry!\n");
+        return;
+    }
+
+    if ( code == IOMMU_EVENT_IO_PAGE_FALT )
+    {
+        device_id = get_field_from_reg_u32(entry[0],
+                                           IOMMU_EVENT_DEVICE_ID_MASK,
+                                           IOMMU_EVENT_DEVICE_ID_SHIFT);
+        domain_id = get_field_from_reg_u32(entry[1],
+                                           IOMMU_EVENT_DOMAIN_ID_MASK,
+                                           IOMMU_EVENT_DOMAIN_ID_SHIFT);
+        addr = (u64 *)(entry + 2);
+        printk(XENLOG_ERR "AMD-Vi: "
+            "%s: domain:%d, device id:0x%x, fault address:0x%"PRIx64"\n",
+            event_str[code-1], domain_id, device_id, *addr);
+
+        /* Tell the device to stop DMAing; we can't rely on the guest to
+         * control it for us. */
+        for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ )
+            if ( get_dma_requestor_id(bdf) == device_id )
+            {
+                cword = pci_conf_read16(PCI_BUS(bdf), PCI_SLOT(bdf),
+                                PCI_FUNC(bdf), PCI_COMMAND);
+                pci_conf_write16(PCI_BUS(bdf), PCI_SLOT(bdf),
+                                 PCI_FUNC(bdf), PCI_COMMAND,
+                                 cword & ~PCI_COMMAND_MASTER);
+            }
+    }
+}
+
+static void amd_iommu_page_fault(int vector, void *dev_id,
+                             struct cpu_user_regs *regs)
+{
+    u32 event[4];
+    u32 entry;
+    unsigned long flags;
+    int ret = 0;
+    struct amd_iommu *iommu = dev_id;
+
+    spin_lock_irqsave(&iommu->lock, flags);
+    ret = amd_iommu_read_event_log(iommu, event);
+    /* reset interrupt status bit */
+    entry = readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
+    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
+                         IOMMU_STATUS_EVENT_LOG_INT_MASK,
+                         IOMMU_STATUS_EVENT_LOG_INT_SHIFT, &entry);
+    writel(entry, iommu->mmio_base+IOMMU_STATUS_MMIO_OFFSET);
+    spin_unlock_irqrestore(&iommu->lock, flags);
+
+    if ( ret != 0 )
+        return;
+    parse_event_log_entry(event);
+}
+
+static int set_iommu_interrupt_handler(struct amd_iommu *iommu)
+{
+    int vector, ret;
+
+    vector = assign_irq_vector(AUTO_ASSIGN);
+
+    if ( !vector )
+    {
+        AMD_IOMMU_DEBUG("no vectors\n");
+        return 0;
+    }
+
+    vector_to_iommu[vector] = iommu;
+
+    /* make irq == vector */
+    irq_vector[vector] = vector;
+    vector_irq[vector] = vector;
+
+    irq_desc[vector].handler = &iommu_msi_type;
+    ret = request_irq(vector, amd_iommu_page_fault, 0, "amd_iommu", iommu);
+    if ( ret )
+    {
+        AMD_IOMMU_DEBUG("can't request irq\n");
+        return 0;
+    }
+    iommu->vector = vector;
+    return vector;
+}
+
+void __init enable_iommu(struct amd_iommu *iommu)
+{
+    unsigned long flags;
+
+    spin_lock_irqsave(&iommu->lock, flags);
+
+    if ( iommu->enabled )
+    {
+        spin_unlock_irqrestore(&iommu->lock, flags); 
+        return;
+    }
+
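+    /* Every IOMMU shares the single global device table set up by
+     * amd_iommu_setup_device_table(). */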
+    iommu->dev_table.alloc_size = device_table.alloc_size;
+    iommu->dev_table.entries = device_table.entries;
+    iommu->dev_table.buffer = device_table.buffer;
+
+    register_iommu_dev_table_in_mmio_space(iommu);
+    register_iommu_cmd_buffer_in_mmio_space(iommu);
+    register_iommu_event_log_in_mmio_space(iommu);
+    register_iommu_exclusion_range(iommu);
+
+    amd_iommu_msi_data_init (iommu);
+    amd_iommu_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map)));
+    amd_iommu_msi_enable(iommu, IOMMU_CONTROL_ENABLED);
+
+    set_iommu_command_buffer_control(iommu, IOMMU_CONTROL_ENABLED);
+    set_iommu_event_log_control(iommu, IOMMU_CONTROL_ENABLED);
+    set_iommu_translation_control(iommu, IOMMU_CONTROL_ENABLED);
+
+    printk("AMD-Vi: IOMMU %d Enabled.\n", nr_amd_iommus );
+    nr_amd_iommus++;
+
+    iommu->enabled = 1;
+    spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static void __init deallocate_iommu_table_struct(
+    struct table_struct *table)
+{
+    int order = 0;
+    if ( table->buffer )
+    {
+        order = get_order_from_bytes(table->alloc_size);
+        __free_amd_iommu_tables(table->buffer, order);
+        table->buffer = NULL;
+    }
+}
+
+static int __init allocate_iommu_table_struct(struct table_struct *table,
+                                              const char *name)
+{
+    int order = 0;
+    if ( table->buffer == NULL )
+    {
+        order = get_order_from_bytes(table->alloc_size);
+        table->buffer = __alloc_amd_iommu_tables(order);
+
+        if ( table->buffer == NULL )
+        {
+            AMD_IOMMU_DEBUG("Error allocating %s\n", name);
+            return -ENOMEM;
+        }
+        memset(table->buffer, 0, PAGE_SIZE * (1UL << order));
+    }
+    return 0;
+}
+
+static int __init allocate_cmd_buffer(struct amd_iommu *iommu)
+{
+    /* allocate 'command buffer' in power of 2 increments of 4K */
+    iommu->cmd_buffer_tail = 0;
+    iommu->cmd_buffer.alloc_size = PAGE_SIZE <<
+        get_order_from_bytes(
+            PAGE_ALIGN(amd_iommu_cmd_buffer_entries *
+                       IOMMU_CMD_BUFFER_ENTRY_SIZE));
+    iommu->cmd_buffer.entries = iommu->cmd_buffer.alloc_size /
+        IOMMU_CMD_BUFFER_ENTRY_SIZE;
+
+    return (allocate_iommu_table_struct(&iommu->cmd_buffer, "Command Buffer"));
+}
+
+static int __init allocate_event_log(struct amd_iommu *iommu)
+{
+    /* allocate 'event log' in power of 2 increments of 4K */
+    iommu->event_log_head = 0;
+    iommu->event_log.alloc_size = PAGE_SIZE <<
+        get_order_from_bytes(
+            PAGE_ALIGN(amd_iommu_event_log_entries *
+                       IOMMU_EVENT_LOG_ENTRY_SIZE));
+    iommu->event_log.entries = iommu->event_log.alloc_size /
+        IOMMU_EVENT_LOG_ENTRY_SIZE;
+
+    return (allocate_iommu_table_struct(&iommu->event_log, "Event Log"));
+}
+
+int __init amd_iommu_init_one(struct amd_iommu *iommu)
+{
+    if ( allocate_cmd_buffer(iommu) != 0 )
+        goto error_out;
+
+    if ( allocate_event_log(iommu) != 0 )
+        goto error_out;
+
+    if ( map_iommu_mmio_region(iommu) != 0 )
+        goto error_out;
+
+    if ( set_iommu_interrupt_handler(iommu) == 0 )
+        goto error_out;
+
+    enable_iommu(iommu);
+    return 0;
+
+error_out:
+    return -ENODEV;
+}
+
+void __init amd_iommu_init_cleanup(void)
+{
+    struct amd_iommu *iommu, *next;
+    int bdf;
+
+    list_for_each_entry_safe ( iommu, next, &amd_iommu_head, list )
+    {
+        list_del(&iommu->list);
+        if ( iommu->enabled )
+        {
+            deallocate_iommu_table_struct(&iommu->cmd_buffer);
+            deallocate_iommu_table_struct(&iommu->event_log);
+            unmap_iommu_mmio_region(iommu);
+        }
+        xfree(iommu);
+    }
+
+    /* free interrupt remapping table */
+    for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ )
+    {
+        if ( ivrs_mappings[bdf].intremap_table )
+            amd_iommu_free_intremap_table(bdf);
+    }
+
+    /* free device table */
+    deallocate_iommu_table_struct(&device_table);
+
+    /* free IVRS_mappings */
+    if ( ivrs_mappings )
+    {
+        xfree(ivrs_mappings);
+        ivrs_mappings = NULL;
+    }
+
+    iommu_enabled = 0;
+    iommu_passthrough = 0;
+    iommu_intremap = 0;
+}
+
+static int __init init_ivrs_mapping(void)
+{
+    int bdf;
+
+    BUG_ON( !ivrs_bdf_entries );
+
+    ivrs_mappings = xmalloc_array( struct ivrs_mappings, ivrs_bdf_entries);
+    if ( ivrs_mappings == NULL )
+    {
+        AMD_IOMMU_DEBUG("Error allocating IVRS Mappings table\n");
+        return -ENOMEM;
+    }
+    memset(ivrs_mappings, 0, ivrs_bdf_entries * sizeof(struct ivrs_mappings));
+
+    /* assign default values for device entries */
+    for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ )
+    {
+        ivrs_mappings[bdf].dte_requestor_id = bdf;
+        ivrs_mappings[bdf].dte_sys_mgt_enable =
+            IOMMU_DEV_TABLE_SYS_MGT_MSG_FORWARDED;
+        ivrs_mappings[bdf].dte_allow_exclusion = IOMMU_CONTROL_DISABLED;
+        ivrs_mappings[bdf].unity_map_enable = IOMMU_CONTROL_DISABLED;
+        ivrs_mappings[bdf].iommu = NULL;
+
+        ivrs_mappings[bdf].intremap_table = NULL;
+        ivrs_mappings[bdf].dte_lint1_pass = IOMMU_CONTROL_DISABLED;
+        ivrs_mappings[bdf].dte_lint0_pass = IOMMU_CONTROL_DISABLED;
+        ivrs_mappings[bdf].dte_nmi_pass = IOMMU_CONTROL_DISABLED;
+        ivrs_mappings[bdf].dte_ext_int_pass = IOMMU_CONTROL_DISABLED;
+        ivrs_mappings[bdf].dte_init_pass = IOMMU_CONTROL_DISABLED;
+
+        if ( amd_iommu_perdev_intremap )
+            spin_lock_init(&ivrs_mappings[bdf].intremap_lock);
+    }
+    return 0;
+}
+
+static int __init amd_iommu_setup_device_table(void)
+{
+    int bdf;
+    void *intr_tb, *dte;
+    int sys_mgt, dev_ex, lint1_pass, lint0_pass, nmi_pass, ext_int_pass,
+        init_pass;
+    
+    BUG_ON(ivrs_bdf_entries == 0);
+    
+    /* allocate 'device table' on a 4K boundary */
+    device_table.alloc_size = PAGE_SIZE << get_order_from_bytes(
+        PAGE_ALIGN(ivrs_bdf_entries * IOMMU_DEV_TABLE_ENTRY_SIZE));
+    device_table.entries = device_table.alloc_size / IOMMU_DEV_TABLE_ENTRY_SIZE;
+
+    if ( allocate_iommu_table_struct(&device_table, "Device Table") != 0 )
+        return -ENOMEM;
+
+    /* add device table entries */
+    for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ ) 
+    {
+        intr_tb = ivrs_mappings[bdf].intremap_table;
+        
+        if ( intr_tb )
+        {
+            sys_mgt = ivrs_mappings[bdf].dte_sys_mgt_enable;
+            dev_ex = ivrs_mappings[bdf].dte_allow_exclusion;
+            
+            /* get interrupt remapping settings */
+            lint1_pass = ivrs_mappings[bdf].dte_lint1_pass;
+            lint0_pass = ivrs_mappings[bdf].dte_lint0_pass;
+            nmi_pass = ivrs_mappings[bdf].dte_nmi_pass;
+            ext_int_pass = ivrs_mappings[bdf].dte_ext_int_pass;
+            init_pass = ivrs_mappings[bdf].dte_init_pass;
+
+            /* add device table entry */
+            dte = device_table.buffer + (bdf * IOMMU_DEV_TABLE_ENTRY_SIZE);
+            amd_iommu_add_dev_table_entry(
+                dte, sys_mgt, dev_ex, lint1_pass, lint0_pass,
+                nmi_pass, ext_int_pass, init_pass);
+
+            amd_iommu_set_intremap_table(
+                dte, (u64)virt_to_maddr(intr_tb), iommu_intremap);
+
+            AMD_IOMMU_DEBUG("Add device table entry at DTE:0x%x, "
+                            "intremap_table:%"PRIx64"\n", bdf,
+                            (u64)virt_to_maddr(intr_tb));
+        }
+    }
+    
+    return 0;
+}
+
+int __init amd_iommu_init(void)
+{
+    struct amd_iommu *iommu;
+
+    BUG_ON( !iommu_found() );
+
+    /* find the max BDF in the IVRS table; it is used in init_ivrs_mapping() */
+    ivrs_bdf_entries = amd_iommu_get_ivrs_dev_entries();
+
+    if ( !ivrs_bdf_entries )
+        goto error_out;
+
+    if ( init_ivrs_mapping() != 0 )
+        goto error_out;
+
+    /* start to read and store IVRS info into ivrs_mapping structure */
+    if ( amd_iommu_update_ivrs_mapping_acpi() != 0 )
+        goto error_out;
+
+    /* initialize io-apic interrupt remapping entries */
+    if ( amd_iommu_setup_ioapic_remapping() != 0 )
+        goto error_out;
+
+    /* allocate and initialize a global device table shared by all IOMMUs */
+    if ( amd_iommu_setup_device_table() != 0 )
+        goto error_out;
+
+    for_each_amd_iommu ( iommu )
+        if ( amd_iommu_init_one(iommu) != 0 )
+            goto error_out;
+    return 0;
+
+error_out:
+    amd_iommu_init_cleanup();
+    return -ENODEV;
+}
diff -Naurp xen/drivers/passthrough/amd/iommu_intr.c xen-redhat/drivers/passthrough/amd/iommu_intr.c
--- xen/drivers/passthrough/amd/iommu_intr.c
+++ xen-redhat/drivers/passthrough/amd/iommu_intr.c
@@ -0,0 +1,399 @@
+/*
+ * Copyright (C) 2007 Advanced Micro Devices, Inc.
+ * Author: Wei Wang <wei.wang2@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <xen/sched.h>
+#include <xen/hvm/iommu.h>
+#include <asm/amd-iommu.h>
+#include <asm/hvm/svm/amd-iommu-proto.h>
+
+int ioapic_bdf[MAX_IO_APICS];
+#define INTREMAP_TABLE_ORDER    1
+#define INTREMAP_LENGTH 0xB
+#define INTREMAP_ENTRIES (1 << INTREMAP_LENGTH)
+
+extern struct ivrs_mappings *ivrs_mappings;
+extern unsigned short ivrs_bdf_entries;
+void *shared_intremap_table;
+static DEFINE_SPINLOCK(shared_intremap_lock);
+
+static spinlock_t* get_intremap_lock(int req_id)
+{
+    return (amd_iommu_perdev_intremap ?
+           &ivrs_mappings[req_id].intremap_lock:
+           &shared_intremap_lock);
+}
+
+static int get_intremap_requestor_id(int bdf)
+{
+    ASSERT( bdf < ivrs_bdf_entries );
+    return ivrs_mappings[bdf].dte_requestor_id;
+}
+
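+/*
+ * The interrupt remapping table is indexed by the destination-mode bit
+ * combined with the interrupt vector.
+ */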
+static int get_intremap_offset(u8 vector, u8 dm)
+{
+    int offset = 0;
+    offset = (dm << INT_REMAP_INDEX_DM_SHIFT) & INT_REMAP_INDEX_DM_MASK;
+    offset |= (vector << INT_REMAP_INDEX_VECTOR_SHIFT ) & 
+        INT_REMAP_INDEX_VECTOR_MASK;
+    return offset;
+}
+
+static u8 *get_intremap_entry(int bdf, int offset)
+{
+    u8 *table;
+
+    table = (u8*)ivrs_mappings[bdf].intremap_table;
+    ASSERT( (table != NULL) && (offset < INTREMAP_ENTRIES) );
+
+    return (u8*) (table + offset);
+}
+
+static void free_intremap_entry(int bdf, int offset)
+{
+    u32* entry;
+    entry = (u32*)get_intremap_entry(bdf, offset);
+    memset(entry, 0, sizeof(u32));
+}
+
+static void update_intremap_entry(u32* entry, u8 vector, u8 int_type,
+    u8 dest_mode, u8 dest)
+{
+    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, 0,
+                            INT_REMAP_ENTRY_REMAPEN_MASK,
+                            INT_REMAP_ENTRY_REMAPEN_SHIFT, entry);
+    set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, *entry,
+                            INT_REMAP_ENTRY_SUPIOPF_MASK,
+                            INT_REMAP_ENTRY_SUPIOPF_SHIFT, entry);
+    set_field_in_reg_u32(int_type, *entry,
+                            INT_REMAP_ENTRY_INTTYPE_MASK,
+                            INT_REMAP_ENTRY_INTTYPE_SHIFT, entry);
+    set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, *entry,
+                            INT_REMAP_ENTRY_REQEOI_MASK,
+                            INT_REMAP_ENTRY_REQEOI_SHIFT, entry);
+    set_field_in_reg_u32((u32)dest_mode, *entry,
+                            INT_REMAP_ENTRY_DM_MASK,
+                            INT_REMAP_ENTRY_DM_SHIFT, entry);
+    set_field_in_reg_u32((u32)dest, *entry,
+                            INT_REMAP_ENTRY_DEST_MAST,
+                            INT_REMAP_ENTRY_DEST_SHIFT, entry);
+    set_field_in_reg_u32((u32)vector, *entry,
+                            INT_REMAP_ENTRY_VECTOR_MASK,
+                            INT_REMAP_ENTRY_VECTOR_SHIFT, entry);
+}
+
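+/* Queue an INVALIDATE_INTERRUPT_TABLE command for the given device id. */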
+void invalidate_interrupt_table(struct amd_iommu *iommu, u16 device_id)
+{
+    u32 cmd[4], entry;
+
+    cmd[3] = cmd[2] = 0;
+    set_field_in_reg_u32(device_id, 0,
+                         IOMMU_INV_INT_TABLE_DEVICE_ID_MASK,
+                         IOMMU_INV_INT_TABLE_DEVICE_ID_SHIFT, &entry);
+    cmd[0] = entry;
+    set_field_in_reg_u32(IOMMU_CMD_INVALIDATE_INT_TABLE, 0,
+                         IOMMU_CMD_OPCODE_MASK, IOMMU_CMD_OPCODE_SHIFT,
+                         &entry);
+    cmd[1] = entry;
+    send_iommu_command(iommu, cmd);
+}
+
+static void update_intremap_entry_from_ioapic(
+    int bdf,
+    struct amd_iommu *iommu,
+    struct IO_APIC_route_entry *ioapic_rte,
+    unsigned int rte_upper, unsigned int value)
+{
+    unsigned long flags;
+    u32* entry;
+    u8 delivery_mode, dest, vector, dest_mode;
+    struct IO_APIC_route_entry *rte = ioapic_rte;
+    int req_id;
+    spinlock_t *lock;
+    int offset;
+
+    req_id = get_intremap_requestor_id(bdf);
+    lock = get_intremap_lock(req_id);
+    /* update the remapping entry only when the upper half of the RTE is written */
+    if ( rte_upper )
+    {
+        delivery_mode = rte->delivery_mode;
+        vector = rte->vector;
+        dest_mode = rte->dest_mode;
+        dest = rte->dest.logical.logical_dest;
+        
+        spin_lock_irqsave(lock, flags);
+        offset = get_intremap_offset(vector, delivery_mode);
+        entry = (u32*)get_intremap_entry(req_id, offset);
+
+        update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest);
+        spin_unlock_irqrestore(lock, flags);
+        
+        if ( iommu->enabled )
+        {
+            spin_lock_irqsave(&iommu->lock, flags);
+            invalidate_interrupt_table(iommu, req_id);
+            flush_command_buffer(iommu);
+            spin_unlock_irqrestore(&iommu->lock, flags);
+        }
+    }
+}
+
+
+extern int nr_ioapic_registers[MAX_IO_APICS];
+extern int nr_ioapics;
+
+int __init amd_iommu_setup_ioapic_remapping(void)
+{
+    struct IO_APIC_route_entry rte = {0};
+    unsigned long flags;
+    u32* entry;
+    int apic, pin;
+    u8 delivery_mode, dest, vector, dest_mode;
+    u16 bdf, req_id, bus, devfn;
+    struct amd_iommu *iommu;
+    spinlock_t *lock;
+    int offset;
+
+    /* Read ioapic entries and update interrupt remapping table accordingly */
+    for ( apic = 0; apic < nr_ioapics; apic++ )
+    {
+        for ( pin = 0; pin < nr_ioapic_registers[apic]; pin++ )
+        {
+            *(((int *)&rte) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
+            *(((int *)&rte) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
+
+            if ( rte.mask == 1 )
+                continue;
+
+            bdf = ioapic_bdf[IO_APIC_ID(apic)];
+            bus = bdf >> 8;
+            devfn = bdf & 0xFF;
+            iommu = find_iommu_for_device(bus, devfn);
+            
+            if ( !iommu )
+            {
+                AMD_IOMMU_DEBUG("failed to find iommu for ioapic device "
+                                "id = 0x%x\n", bdf);
+                continue;
+            }
+            
+            req_id = get_intremap_requestor_id(bdf);
+            lock = get_intremap_lock(req_id);
+
+            delivery_mode = rte.delivery_mode;
+            vector = rte.vector;
+            dest_mode = rte.dest_mode;
+            dest = rte.dest.logical.logical_dest;
+
+            spin_lock_irqsave(lock, flags);
+            offset = get_intremap_offset(vector, delivery_mode);
+            entry = (u32*)get_intremap_entry(req_id, offset);
+            update_intremap_entry(entry, vector, delivery_mode, dest_mode, 
+                                  dest);
+            spin_unlock_irqrestore(lock, flags);
+
+            if ( iommu->enabled )
+            {
+                spin_lock_irqsave(&iommu->lock, flags);
+                invalidate_interrupt_table(iommu, req_id);
+                flush_command_buffer(iommu);
+                spin_unlock_irqrestore(&iommu->lock, flags);
+            }
+        }
+    }
+
+    return 0;
+}
+
+void amd_iommu_ioapic_update_ire(
+    unsigned int apic, unsigned int reg, unsigned int value)
+{
+    struct IO_APIC_route_entry ioapic_rte = { 0 };
+    unsigned int rte_upper = (reg & 1) ? 1 : 0;
+    int saved_mask;
+    u16 bus, devfn, bdf;
+    struct amd_iommu *iommu;
+
+    *IO_APIC_BASE(apic) = reg;
+    *(IO_APIC_BASE(apic)+4) = value;
+
+    if ( !iommu_intremap )
+        return;
+
+    /* get device id of ioapic devices */
+    bdf = ioapic_bdf[IO_APIC_ID(apic)];
+    bus = bdf >> 8;
+    devfn = bdf & 0xFF;
+    iommu = find_iommu_for_device(bus, devfn);
+    if ( !iommu )
+    {
+        AMD_IOMMU_DEBUG(
+            "Failed to find iommu for ioapic device id = 0x%x\n", bdf);
+        return;
+    }
+
+    if ( !rte_upper )
+        return;
+
+    reg--;
+    /* read both lower and upper 32-bits of rte entry */
+    *IO_APIC_BASE(apic) = reg;
+    *(((u32 *)&ioapic_rte) + 0) = *(IO_APIC_BASE(apic)+4);
+    *IO_APIC_BASE(apic) = reg + 1;
+    *(((u32 *)&ioapic_rte) + 1) = *(IO_APIC_BASE(apic)+4);
+
+    /* mask the interrupt while we change the intremap table */
+    saved_mask = ioapic_rte.mask;
+    ioapic_rte.mask = 1;
+    *IO_APIC_BASE(apic) = reg;
+    *(IO_APIC_BASE(apic)+4) = *(((int *)&ioapic_rte)+0);
+    ioapic_rte.mask = saved_mask;
+
+    update_intremap_entry_from_ioapic(bdf, iommu, 
+                                      &ioapic_rte, rte_upper, value);
+
+    /* restore the original mask bit now that the intremap table is updated */
+    *IO_APIC_BASE(apic) = reg;
+    *(IO_APIC_BASE(apic)+4) = *(((u32 *)&ioapic_rte)+0);
+}
+
+static void update_intremap_entry_from_msi_msg(
+    struct amd_iommu *iommu, struct pci_dev *pdev,
+    struct msi_desc *msi_desc, struct msi_msg *msg)
+{
+    unsigned long flags;
+    u32* entry;
+    u16 dev_id, alias_id, bus, devfn, req_id;
+
+    u8 delivery_mode, dest, vector, dest_mode;
+    spinlock_t *lock;
+    int offset;
+
+    dev_id = (pdev->bus << 8) | pdev->devfn;
+    bus = pdev->bus;
+    devfn = pdev->devfn;
+    req_id = get_dma_requestor_id(dev_id);
+    alias_id = get_intremap_requestor_id(dev_id);
+
+    if ( msg == NULL )
+    {
+        lock = get_intremap_lock(req_id);
+        spin_lock_irqsave(lock, flags);
+        free_intremap_entry(req_id, msi_desc->remap_index);
+        spin_unlock_irqrestore(lock, flags);
+
+        if ( ( req_id != alias_id ) &&
+            ivrs_mappings[alias_id].intremap_table != NULL )
+        {
+            lock = get_intremap_lock(alias_id);
+            spin_lock_irqsave(lock, flags);
+            free_intremap_entry(alias_id, msi_desc->remap_index);
+            spin_unlock_irqrestore(lock, flags);
+        }
+        goto done;
+    }
+
+    lock = get_intremap_lock(req_id);
+
+    spin_lock_irqsave(lock, flags);
+    dest_mode = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) & 0x1;
+    delivery_mode = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1;
+    vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) & MSI_DATA_VECTOR_MASK;
+    dest = (msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff;
+    offset = get_intremap_offset(vector, delivery_mode);
+    msi_desc->remap_index = offset;
+
+    entry = (u32*)get_intremap_entry(req_id, offset);
+    update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest);
+    spin_unlock_irqrestore(lock, flags);
+    
+    /*
+     * In some special cases, a PCIe device (e.g. a SATA controller in IDE
+     * mode) will use its alias id to index the interrupt remapping table.
+     * We have to set up a secondary interrupt remapping entry to satisfy
+     * such devices.
+     */
+
+    lock = get_intremap_lock(alias_id);
+    if ( ( req_id != alias_id ) &&
+         ivrs_mappings[alias_id].intremap_table != NULL )
+    {
+        spin_lock_irqsave(lock, flags);
+        entry = (u32*)get_intremap_entry(alias_id, offset);
+        update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest);
+        spin_unlock_irqrestore(lock, flags);
+    }
+
+done:
+    if ( iommu->enabled )
+    {
+        spin_lock_irqsave(&iommu->lock, flags);
+        invalidate_interrupt_table(iommu, dev_id);
+        if ( alias_id != req_id )
+            invalidate_interrupt_table(iommu, alias_id);
+        flush_command_buffer(iommu);
+        spin_unlock_irqrestore(&iommu->lock, flags);
+    }
+
+    return;
+}
+
+void amd_iommu_msi_msg_update_ire(
+    struct msi_desc *msi_desc, struct msi_msg *msg)
+{
+    struct pci_dev *pdev = msi_desc->dev;
+    struct amd_iommu *iommu = NULL;
+
+    if ( !iommu_intremap )
+        return;
+
+    iommu = find_iommu_for_device(pdev->bus, pdev->devfn);
+
+    if ( !iommu )
+    {
+        AMD_IOMMU_DEBUG(
+            "Failed to find iommu for MSI device id = 0x%x\n",
+            (pdev->bus << 8) | pdev->devfn);
+        return;
+    }       
+
+    update_intremap_entry_from_msi_msg(iommu, pdev, msi_desc, msg);
+}
+
+void __init amd_iommu_free_intremap_table(int bdf)
+{
+    void *tb = ivrs_mappings[bdf].intremap_table;
+
+    if ( tb )
+    {
+        __free_amd_iommu_tables(tb, INTREMAP_TABLE_ORDER);
+        ivrs_mappings[bdf].intremap_table = NULL;
+    }
+}
+
+void* __init amd_iommu_alloc_intremap_table(void)
+{
+    void *tb;
+    tb = __alloc_amd_iommu_tables(INTREMAP_TABLE_ORDER);
+    BUG_ON(tb == NULL);
+    memset(tb, 0, PAGE_SIZE * (1UL << INTREMAP_TABLE_ORDER));
+    return tb;
+}
+
diff -Naurp xen/drivers/passthrough/amd/iommu_map.c xen-redhat/drivers/passthrough/amd/iommu_map.c
--- xen/drivers/passthrough/amd/iommu_map.c
+++ xen-redhat/drivers/passthrough/amd/iommu_map.c
@@ -0,0 +1,659 @@
+/*
+ * Copyright (C) 2007 Advanced Micro Devices, Inc.
+ * Author: Leo Duran <leo.duran@amd.com>
+ * Author: Wei Wang <wei.wang2@amd.com> - adapted to xen
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <xen/sched.h>
+#include <xen/hvm/iommu.h>
+#include <asm/amd-iommu.h>
+#include <asm/hvm/svm/amd-iommu-proto.h>
+
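+/*
+ * Copy one command into the IOMMU command ring at the current tail.
+ * Returns 1 on success, or 0 if the ring is full.
+ */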
+static int queue_iommu_command(struct amd_iommu *iommu, u32 cmd[])
+{
+    u32 tail, head, *cmd_buffer;
+    int i;
+
+    tail = iommu->cmd_buffer_tail;
+    if ( ++tail == iommu->cmd_buffer.entries )
+        tail = 0;
+    head = get_field_from_reg_u32(
+        readl(iommu->mmio_base+IOMMU_CMD_BUFFER_HEAD_OFFSET),
+        IOMMU_CMD_BUFFER_HEAD_MASK,
+        IOMMU_CMD_BUFFER_HEAD_SHIFT);
+    if ( head != tail )
+    {
+        cmd_buffer = (u32 *)(iommu->cmd_buffer.buffer +
+                             (iommu->cmd_buffer_tail *
+                              IOMMU_CMD_BUFFER_ENTRY_SIZE));
+        for ( i = 0; i < IOMMU_CMD_BUFFER_U32_PER_ENTRY; i++ )
+            cmd_buffer[i] = cmd[i];
+
+        iommu->cmd_buffer_tail = tail;
+        return 1;
+    }
+
+    return 0;
+}
+
+static void commit_iommu_command_buffer(struct amd_iommu *iommu)
+{
+    u32 tail;
+
+    set_field_in_reg_u32(iommu->cmd_buffer_tail, 0,
+                         IOMMU_CMD_BUFFER_TAIL_MASK,
+                         IOMMU_CMD_BUFFER_TAIL_SHIFT, &tail);
+    writel(tail, iommu->mmio_base+IOMMU_CMD_BUFFER_TAIL_OFFSET);
+}
+
+int send_iommu_command(struct amd_iommu *iommu, u32 cmd[])
+{
+    if ( queue_iommu_command(iommu, cmd) )
+    {
+        commit_iommu_command_buffer(iommu);
+        return 1;
+    }
+
+    return 0;
+}
+
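+/* Queue an INVALIDATE_IOMMU_PAGES command for a single page of the given domain. */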
+static void invalidate_iommu_page(struct amd_iommu *iommu,
+                                  u64 io_addr, u16 domain_id)
+{
+    u64 addr_lo, addr_hi;
+    u32 cmd[4], entry;
+
+    addr_lo = io_addr & DMA_32BIT_MASK;
+    addr_hi = io_addr >> 32;
+
+    set_field_in_reg_u32(domain_id, 0,
+                         IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_MASK,
+                         IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_SHIFT, &entry);
+    set_field_in_reg_u32(IOMMU_CMD_INVALIDATE_IOMMU_PAGES, entry,
+                         IOMMU_CMD_OPCODE_MASK, IOMMU_CMD_OPCODE_SHIFT,
+                         &entry);
+    cmd[1] = entry;
+
+    set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, 0,
+                         IOMMU_INV_IOMMU_PAGES_S_FLAG_MASK,
+                         IOMMU_INV_IOMMU_PAGES_S_FLAG_SHIFT, &entry);
+    set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_INV_IOMMU_PAGES_PDE_FLAG_MASK,
+                         IOMMU_INV_IOMMU_PAGES_PDE_FLAG_SHIFT, &entry);
+    set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, entry,
+                         IOMMU_INV_IOMMU_PAGES_ADDR_LOW_MASK,
+                         IOMMU_INV_IOMMU_PAGES_ADDR_LOW_SHIFT, &entry);
+    cmd[2] = entry;
+
+    set_field_in_reg_u32((u32)addr_hi, 0,
+                         IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_MASK,
+                         IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_SHIFT, &entry);
+    cmd[3] = entry;
+
+    cmd[0] = 0;
+    send_iommu_command(iommu, cmd);
+}
+
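+/*
+ * Send an empty COMPLETION_WAIT command and poll the ComWaitInt status bit
+ * so that all previously queued commands are known to have completed.
+ */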
+void flush_command_buffer(struct amd_iommu *iommu)
+{
+    u32 cmd[4], status;
+    int loop_count, comp_wait;
+
+    /* clear 'ComWaitInt' in status register (WIC) */
+    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, 0,
+                         IOMMU_STATUS_COMP_WAIT_INT_MASK,
+                         IOMMU_STATUS_COMP_WAIT_INT_SHIFT, &status);
+    writel(status, iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
+
+    /* send an empty COMPLETION_WAIT command to flush command buffer */
+    cmd[3] = cmd[2] = 0;
+    set_field_in_reg_u32(IOMMU_CMD_COMPLETION_WAIT, 0,
+                         IOMMU_CMD_OPCODE_MASK,
+                         IOMMU_CMD_OPCODE_SHIFT, &cmd[1]);
+    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, 0,
+                         IOMMU_COMP_WAIT_I_FLAG_MASK,
+                         IOMMU_COMP_WAIT_I_FLAG_SHIFT, &cmd[0]);
+    send_iommu_command(iommu, cmd);
+
+    /* wait for 'ComWaitInt' to signal completion */
+    loop_count = 1000;
+    do {
+        status = readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
+        comp_wait = get_field_from_reg_u32(status,
+            IOMMU_STATUS_COMP_WAIT_INT_MASK,
+            IOMMU_STATUS_COMP_WAIT_INT_SHIFT);
+        --loop_count;
+    } while ( !comp_wait && loop_count );
+
+    if ( comp_wait )
+    {
+        /* clear 'ComWaitInt' in status register (WIC) */
+        status &= IOMMU_STATUS_COMP_WAIT_INT_MASK;
+        writel(status, iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
+        return;
+    }
+    AMD_IOMMU_DEBUG("Warning: ComWaitInt bit did not assert!\n");
+}
+
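+/* Zero the level-1 PTE for gfn inside the page table referenced by l2e. */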
+static void clear_iommu_l1e_present(u64 l2e, unsigned long gfn)
+{
+    u32 *l1e;
+    int offset;
+    void *l1_table;
+
+    l1_table = map_domain_page(l2e >> PAGE_SHIFT);
+
+    offset = gfn & (~PTE_PER_TABLE_MASK);
+    l1e = (u32*)(l1_table + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE));
+
+    /* clear l1 entry */
+    l1e[0] = l1e[1] = 0;
+
+    unmap_domain_page(l1_table);
+}
+
+static void set_iommu_l1e_present(u64 l2e, unsigned long gfn,
+                                 u64 maddr, int iw, int ir)
+{
+    u64 addr_lo, addr_hi;
+    u32 entry;
+    void *l1_table;
+    int offset;
+    u32 *l1e;
+
+    l1_table = map_domain_page(l2e >> PAGE_SHIFT);
+
+    offset = gfn & (~PTE_PER_TABLE_MASK);
+    l1e = (u32*)((u8*)l1_table + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE));
+
+    addr_lo = maddr & DMA_32BIT_MASK;
+    addr_hi = maddr >> 32;
+
+    set_field_in_reg_u32((u32)addr_hi, 0,
+                         IOMMU_PTE_ADDR_HIGH_MASK,
+                         IOMMU_PTE_ADDR_HIGH_SHIFT, &entry);
+    set_field_in_reg_u32(iw ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_PTE_IO_WRITE_PERMISSION_MASK,
+                         IOMMU_PTE_IO_WRITE_PERMISSION_SHIFT, &entry);
+    set_field_in_reg_u32(ir ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_PTE_IO_READ_PERMISSION_MASK,
+                         IOMMU_PTE_IO_READ_PERMISSION_SHIFT, &entry);
+    l1e[1] = entry;
+
+    set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0,
+                         IOMMU_PTE_ADDR_LOW_MASK,
+                         IOMMU_PTE_ADDR_LOW_SHIFT, &entry);
+    set_field_in_reg_u32(IOMMU_PAGING_MODE_LEVEL_0, entry,
+                         IOMMU_PTE_NEXT_LEVEL_MASK,
+                         IOMMU_PTE_NEXT_LEVEL_SHIFT, &entry);
+    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
+                         IOMMU_PTE_PRESENT_MASK,
+                         IOMMU_PTE_PRESENT_SHIFT, &entry);
+    l1e[0] = entry;
+
+    unmap_domain_page(l1_table);
+}
+
+static void amd_iommu_set_page_directory_entry(u32 *pde, 
+                                               u64 next_ptr, u8 next_level)
+{
+    u64 addr_lo, addr_hi;
+    u32 entry;
+
+    addr_lo = next_ptr & DMA_32BIT_MASK;
+    addr_hi = next_ptr >> 32;
+
+    /* enable read/write permissions, which will be enforced at the PTE level */
+    set_field_in_reg_u32((u32)addr_hi, 0,
+                         IOMMU_PDE_ADDR_HIGH_MASK,
+                         IOMMU_PDE_ADDR_HIGH_SHIFT, &entry);
+    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
+                         IOMMU_PDE_IO_WRITE_PERMISSION_MASK,
+                         IOMMU_PDE_IO_WRITE_PERMISSION_SHIFT, &entry);
+    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
+                         IOMMU_PDE_IO_READ_PERMISSION_MASK,
+                         IOMMU_PDE_IO_READ_PERMISSION_SHIFT, &entry);
+    pde[1] = entry;
+
+    /* mark next level as 'present' */
+    set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0,
+                         IOMMU_PDE_ADDR_LOW_MASK,
+                         IOMMU_PDE_ADDR_LOW_SHIFT, &entry);
+    set_field_in_reg_u32(next_level, entry,
+                         IOMMU_PDE_NEXT_LEVEL_MASK,
+                         IOMMU_PDE_NEXT_LEVEL_SHIFT, &entry);
+    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
+                         IOMMU_PDE_PRESENT_MASK,
+                         IOMMU_PDE_PRESENT_SHIFT, &entry);
+    pde[0] = entry;
+}
+
+void amd_iommu_set_root_page_table(
+    u32 *dte, u64 root_ptr, u16 domain_id, u8 paging_mode, u8 valid)
+{
+    u64 addr_hi, addr_lo;
+    u32 entry;
+    set_field_in_reg_u32(domain_id, 0,
+                         IOMMU_DEV_TABLE_DOMAIN_ID_MASK,
+                         IOMMU_DEV_TABLE_DOMAIN_ID_SHIFT, &entry);
+    dte[2] = entry;
+    
+    addr_lo = root_ptr & DMA_32BIT_MASK;
+    addr_hi = root_ptr >> 32;
+    
+    set_field_in_reg_u32((u32)addr_hi, 0,
+                         IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_MASK,
+                         IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_SHIFT, &entry);
+    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
+                         IOMMU_DEV_TABLE_IO_WRITE_PERMISSION_MASK,
+                         IOMMU_DEV_TABLE_IO_WRITE_PERMISSION_SHIFT, &entry);
+    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
+                         IOMMU_DEV_TABLE_IO_READ_PERMISSION_MASK,
+                         IOMMU_DEV_TABLE_IO_READ_PERMISSION_SHIFT, &entry);
+    dte[1] = entry;
+    
+    set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0,
+                         IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_MASK,
+                         IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_SHIFT, &entry);
+    set_field_in_reg_u32(paging_mode, entry,
+                         IOMMU_DEV_TABLE_PAGING_MODE_MASK,
+                         IOMMU_DEV_TABLE_PAGING_MODE_SHIFT, &entry);
+    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
+                         IOMMU_DEV_TABLE_TRANSLATION_VALID_MASK,
+                         IOMMU_DEV_TABLE_TRANSLATION_VALID_SHIFT, &entry);
+    set_field_in_reg_u32(valid ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_DEV_TABLE_VALID_MASK,
+                         IOMMU_DEV_TABLE_VALID_SHIFT, &entry);
+    dte[0] = entry;
+}
+
+void amd_iommu_set_intremap_table(u32 *dte, u64 intremap_ptr, u8 int_valid)
+{
+    u64 addr_hi, addr_lo;
+    u32 entry;
+    
+    addr_lo = intremap_ptr & DMA_32BIT_MASK;
+    addr_hi = intremap_ptr >> 32;
+    
+    entry = dte[5];
+    set_field_in_reg_u32((u32)addr_hi, entry,
+                         IOMMU_DEV_TABLE_INT_TABLE_PTR_HIGH_MASK,
+                         IOMMU_DEV_TABLE_INT_TABLE_PTR_HIGH_SHIFT, &entry);
+    /* Fixed and arbitrated interrupts remapped */
+    set_field_in_reg_u32(2, entry,
+                         IOMMU_DEV_TABLE_INT_CONTROL_MASK,
+                         IOMMU_DEV_TABLE_INT_CONTROL_SHIFT, &entry);
+    dte[5] = entry;
+    
+    set_field_in_reg_u32((u32)addr_lo >> 6, 0,
+                         IOMMU_DEV_TABLE_INT_TABLE_PTR_LOW_MASK,
+                         IOMMU_DEV_TABLE_INT_TABLE_PTR_LOW_SHIFT, &entry);
+    /* 2048 entries */
+    set_field_in_reg_u32(0xB, entry,
+                         IOMMU_DEV_TABLE_INT_TABLE_LENGTH_MASK,
+                         IOMMU_DEV_TABLE_INT_TABLE_LENGTH_SHIFT, &entry);
+    /* ignore unmapped interrupts */
+    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
+                         IOMMU_DEV_TABLE_INT_TABLE_IGN_UNMAPPED_MASK,
+                         IOMMU_DEV_TABLE_INT_TABLE_IGN_UNMAPPED_SHIFT, &entry);
+    set_field_in_reg_u32(int_valid ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_DEV_TABLE_INT_VALID_MASK,
+                         IOMMU_DEV_TABLE_INT_VALID_SHIFT, &entry);
+    dte[4] = entry;
+}
+
+void amd_iommu_add_dev_table_entry(
+    u32 *dte, u8 sys_mgt, u8 dev_ex, u8 lint1_pass, u8 lint0_pass,
+    u8 nmi_pass, u8 ext_int_pass, u8 init_pass)
+{
+    u32 entry;
+
+    dte[7] = dte[6] = dte[4] = dte[2] = dte[1] = dte[0] = 0;
+
+    set_field_in_reg_u32(init_pass ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, 0,
+                         IOMMU_DEV_TABLE_INIT_PASSTHRU_MASK,
+                         IOMMU_DEV_TABLE_INIT_PASSTHRU_SHIFT, &entry);
+    set_field_in_reg_u32(ext_int_pass ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_DEV_TABLE_EINT_PASSTHRU_MASK,
+                         IOMMU_DEV_TABLE_EINT_PASSTHRU_SHIFT, &entry);
+    set_field_in_reg_u32(nmi_pass ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_DEV_TABLE_NMI_PASSTHRU_MASK,
+                         IOMMU_DEV_TABLE_NMI_PASSTHRU_SHIFT, &entry);
+    set_field_in_reg_u32(lint0_pass ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_DEV_TABLE_LINT0_ENABLE_MASK,
+                         IOMMU_DEV_TABLE_LINT0_ENABLE_SHIFT, &entry);
+    set_field_in_reg_u32(lint1_pass ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_DEV_TABLE_LINT1_ENABLE_MASK,
+                         IOMMU_DEV_TABLE_LINT1_ENABLE_SHIFT, &entry);
+    dte[5] = entry;
+
+    set_field_in_reg_u32(sys_mgt, 0,
+                         IOMMU_DEV_TABLE_SYS_MGT_MSG_ENABLE_MASK,
+                         IOMMU_DEV_TABLE_SYS_MGT_MSG_ENABLE_SHIFT, &entry);
+    set_field_in_reg_u32(dev_ex, entry,
+                         IOMMU_DEV_TABLE_ALLOW_EXCLUSION_MASK,
+                         IOMMU_DEV_TABLE_ALLOW_EXCLUSION_SHIFT, &entry);
+    dte[3] = entry;
+}
+
+u64 amd_iommu_get_next_table_from_pte(u32 *entry)
+{
+    u64 addr_lo, addr_hi, ptr;
+
+    addr_lo = get_field_from_reg_u32(
+        entry[0],
+        IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_MASK,
+        IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_SHIFT);
+
+    addr_hi = get_field_from_reg_u32(
+        entry[1],
+        IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_MASK,
+        IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_SHIFT);
+
+    ptr = (addr_hi << 32) | (addr_lo << PAGE_SHIFT);
+    return ptr;
+}
+
+static int amd_iommu_is_pte_present(u32 *entry)
+{
+    return (get_field_from_reg_u32(entry[0],
+                                   IOMMU_PDE_PRESENT_MASK,
+                                   IOMMU_PDE_PRESENT_SHIFT));
+}
+
+void invalidate_dev_table_entry(struct amd_iommu *iommu,
+                                u16 device_id)
+{
+    u32 cmd[4], entry;
+
+    cmd[3] = cmd[2] = 0;
+    set_field_in_reg_u32(device_id, 0,
+                         IOMMU_INV_DEVTAB_ENTRY_DEVICE_ID_MASK,
+                         IOMMU_INV_DEVTAB_ENTRY_DEVICE_ID_SHIFT, &entry);
+    cmd[0] = entry;
+
+    set_field_in_reg_u32(IOMMU_CMD_INVALIDATE_DEVTAB_ENTRY, 0,
+                         IOMMU_CMD_OPCODE_MASK, IOMMU_CMD_OPCODE_SHIFT,
+                         &entry);
+    cmd[1] = entry;
+
+    send_iommu_command(iommu, cmd);
+}
+
+int amd_iommu_is_dte_page_translation_valid(u32 *entry)
+{
+    return (get_field_from_reg_u32(entry[0],
+                                   IOMMU_DEV_TABLE_VALID_MASK,
+                                   IOMMU_DEV_TABLE_VALID_SHIFT) &&
+            get_field_from_reg_u32(entry[0],
+                                   IOMMU_DEV_TABLE_TRANSLATION_VALID_MASK,
+                                   IOMMU_DEV_TABLE_TRANSLATION_VALID_SHIFT));
+}
+
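+/*
+ * Walk the I/O page table from 'table' down to level 1, allocating any
+ * missing page directories on the way.  Returns the machine address of the
+ * final page table that holds the PTE for io_pfn, or 0 on failure.
+ */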
+static u64 iommu_l2e_from_pfn(struct page_info *table, int level,
+                              unsigned long io_pfn)
+{
+    unsigned long offset;
+    void *pde = NULL;
+    void *table_vaddr;
+    u64 next_table_maddr = 0;
+
+    BUG_ON( table == NULL || level == 0 );
+
+    while ( level > 1 )
+    {
+        offset = io_pfn >> ((PTE_PER_TABLE_SHIFT *
+                             (level - IOMMU_PAGING_MODE_LEVEL_1)));
+        offset &= ~PTE_PER_TABLE_MASK;
+
+        table_vaddr = map_domain_page(page_to_mfn(table));
+        pde = table_vaddr + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE);
+        next_table_maddr = amd_iommu_get_next_table_from_pte(pde);
+
+        if ( !amd_iommu_is_pte_present(pde) )
+        {
+            if ( next_table_maddr == 0 )
+            {
+                table = alloc_amd_iommu_pgtable();
+                if ( table == NULL )
+                    return 0;
+                next_table_maddr = page_to_maddr(table);
+                amd_iommu_set_page_directory_entry(
+                    (u32 *)pde, next_table_maddr, level - 1);
+            }
+            else /* should never reach here */
+                return 0;
+        }
+
+        unmap_domain_page(table_vaddr);
+        table = maddr_to_page(next_table_maddr);
+        level--;
+    }
+
+    return next_table_maddr;
+}
+
+int amd_iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn)
+{
+    u64 iommu_l2e;
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    int iw = IOMMU_IO_WRITE_ENABLED;
+    int ir = IOMMU_IO_READ_ENABLED;
+
+    BUG_ON( !hd->root_table );
+
+    spin_lock(&hd->mapping_lock);
+
+    if ( is_hvm_domain(d) && !hd->p2m_synchronized )
+        goto out;
+
+    iommu_l2e = iommu_l2e_from_pfn(hd->root_table, hd->paging_mode, gfn);
+    if ( iommu_l2e == 0 )
+    {
+        spin_unlock(&hd->mapping_lock);
+        AMD_IOMMU_DEBUG("Invalid IO pagetable entry gfn = %lx\n", gfn);
+        return -EFAULT;
+    }
+    set_iommu_l1e_present(iommu_l2e, gfn, (u64)mfn << PAGE_SHIFT, iw, ir);
+
+out:
+    spin_unlock(&hd->mapping_lock);
+    return 0;
+}
+
+int amd_iommu_unmap_page(struct domain *d, unsigned long gfn)
+{
+    u64 iommu_l2e;
+    unsigned long flags;
+    struct amd_iommu *iommu;
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+
+    BUG_ON( !hd->root_table );
+
+    spin_lock(&hd->mapping_lock);
+
+    if ( is_hvm_domain(d) && !hd->p2m_synchronized )
+    {
+        spin_unlock(&hd->mapping_lock);
+        return 0;
+    }
+
+    iommu_l2e = iommu_l2e_from_pfn(hd->root_table, hd->paging_mode, gfn);
+
+    if ( iommu_l2e == 0 )
+    {
+        spin_unlock(&hd->mapping_lock);
+        AMD_IOMMU_DEBUG("Invalid IO pagetable entry gfn = %lx\n", gfn);
+        return -EFAULT;
+    }
+
+    /* mark PTE as 'page not present' */
+    clear_iommu_l1e_present(iommu_l2e, gfn);
+    spin_unlock(&hd->mapping_lock);
+
+    /* send INVALIDATE_IOMMU_PAGES command */
+    for_each_amd_iommu ( iommu )
+    {
+        spin_lock_irqsave(&iommu->lock, flags);
+        invalidate_iommu_page(iommu, (u64)gfn << PAGE_SHIFT, hd->domain_id);
+        flush_command_buffer(iommu);
+        spin_unlock_irqrestore(&iommu->lock, flags);
+    }
+
+    return 0;
+}
+
+int amd_iommu_reserve_domain_unity_map(
+    struct domain *domain,
+    unsigned long phys_addr,
+    unsigned long size, int iw, int ir)
+{
+    u64 iommu_l2e;
+    unsigned long npages, i;
+    struct hvm_iommu *hd = domain_hvm_iommu(domain);
+
+    npages = region_to_pages(phys_addr, size);
+
+    spin_lock(&hd->mapping_lock);
+    for ( i = 0; i < npages; ++i )
+    {
+        iommu_l2e = iommu_l2e_from_pfn(
+            hd->root_table, hd->paging_mode, phys_addr >> PAGE_SHIFT);
+
+        if ( iommu_l2e == 0 )
+        {
+            spin_unlock(&hd->mapping_lock);
+            AMD_IOMMU_DEBUG("Invalid IO pagetable entry phys_addr = %lx\n",
+                            phys_addr);
+            return -EFAULT;
+        }
+
+        set_iommu_l1e_present(iommu_l2e,
+            (phys_addr >> PAGE_SHIFT), phys_addr, iw, ir);
+
+        phys_addr += PAGE_SIZE;
+    }
+    spin_unlock(&hd->mapping_lock);
+    return 0;
+}
+
+int amd_iommu_sync_p2m(struct domain *d)
+{
+    unsigned long mfn, gfn;
+    u64 iommu_l2e;
+    struct page_info *page;
+    struct hvm_iommu *hd;
+    int iw = IOMMU_IO_WRITE_ENABLED;
+    int ir = IOMMU_IO_READ_ENABLED;
+
+    if ( !is_hvm_domain(d) )
+        return 0;
+
+    hd = domain_hvm_iommu(d);
+
+    spin_lock(&hd->mapping_lock);
+
+    if ( hd->p2m_synchronized )
+        goto out;
+
+    spin_lock(&d->page_alloc_lock);
+
+    list_for_each_entry( page, &d->page_list, list )
+    {
+        mfn = page_to_mfn(page);
+        gfn = get_gpfn_from_mfn(mfn);
+
+        if ( gfn == INVALID_M2P_ENTRY )
+            continue;
+
+        iommu_l2e = iommu_l2e_from_pfn(hd->root_table, hd->paging_mode, gfn);
+
+        if ( iommu_l2e == 0 )
+        {
+            spin_unlock(&d->page_alloc_lock);
+            spin_unlock(&hd->mapping_lock);
+            AMD_IOMMU_DEBUG("Invalid IO pagetable entry gfn = %lx\n", gfn);
+            return -EFAULT;
+        }
+
+        set_iommu_l1e_present(iommu_l2e, gfn, (u64)mfn << PAGE_SHIFT, iw, ir);
+    }
+
+    spin_unlock(&d->page_alloc_lock);
+
+    hd->p2m_synchronized = 1;
+
+out:
+    spin_unlock(&hd->mapping_lock);
+    return 0;
+}
+
+void invalidate_all_iommu_pages(struct domain *d)
+{
+    u32 cmd[4], entry;
+    unsigned long flags;
+    struct amd_iommu *iommu;
+    int domain_id = d->domain_id;
+    u64 addr_lo = 0x7FFFFFFFFFFFF000ULL & DMA_32BIT_MASK;
+    u64 addr_hi = 0x7FFFFFFFFFFFF000ULL >> 32;
+
+    set_field_in_reg_u32(domain_id, 0,
+                         IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_MASK,
+                         IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_SHIFT, &entry);
+    set_field_in_reg_u32(IOMMU_CMD_INVALIDATE_IOMMU_PAGES, entry,
+                         IOMMU_CMD_OPCODE_MASK, IOMMU_CMD_OPCODE_SHIFT,
+                         &entry);
+    cmd[1] = entry;
+
+    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, 0,
+                         IOMMU_INV_IOMMU_PAGES_S_FLAG_MASK,
+                         IOMMU_INV_IOMMU_PAGES_S_FLAG_SHIFT, &entry);
+    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
+                         IOMMU_INV_IOMMU_PAGES_PDE_FLAG_MASK,
+                         IOMMU_INV_IOMMU_PAGES_PDE_FLAG_SHIFT, &entry);
+    set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, entry,
+                         IOMMU_INV_IOMMU_PAGES_ADDR_LOW_MASK,
+                         IOMMU_INV_IOMMU_PAGES_ADDR_LOW_SHIFT, &entry);
+    cmd[2] = entry;
+
+    set_field_in_reg_u32((u32)addr_hi, 0,
+                         IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_MASK,
+                         IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_SHIFT, &entry);
+    cmd[3] = entry;
+
+    cmd[0] = 0;
+
+    for_each_amd_iommu ( iommu )
+    {
+        spin_lock_irqsave(&iommu->lock, flags);
+        send_iommu_command(iommu, cmd);
+        flush_command_buffer(iommu);
+        spin_unlock_irqrestore(&iommu->lock, flags);
+    }
+}
diff -Naurp xen/drivers/passthrough/amd/Makefile xen-redhat/drivers/passthrough/amd/Makefile
--- xen/drivers/passthrough/amd/Makefile
+++ xen-redhat/drivers/passthrough/amd/Makefile
@@ -0,0 +1,6 @@
+obj-y += iommu_detect.o
+obj-y += iommu_init.o
+obj-y += iommu_map.o
+obj-y += pci_amd_iommu.o
+obj-y += iommu_acpi.o
+obj-y += iommu_intr.o
diff -Naurp xen/drivers/passthrough/amd/pci_amd_iommu.c xen-redhat/drivers/passthrough/amd/pci_amd_iommu.c
--- xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ xen-redhat/drivers/passthrough/amd/pci_amd_iommu.c
@@ -0,0 +1,429 @@
+/*
+ * Copyright (C) 2007 Advanced Micro Devices, Inc.
+ * Author: Leo Duran <leo.duran@amd.com>
+ * Author: Wei Wang <wei.wang2@amd.com> - adapted to xen
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <xen/sched.h>
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
+#include <asm/amd-iommu.h>
+#include <asm/hvm/svm/amd-iommu-proto.h>
+
+extern unsigned short ivrs_bdf_entries;
+extern struct ivrs_mappings *ivrs_mappings;
+extern void *int_remap_table;
+
+struct amd_iommu *find_iommu_for_device(int bus, int devfn)
+{
+    u16 bdf = (bus << 8) | devfn;
+    BUG_ON ( bdf >= ivrs_bdf_entries );
+    return ivrs_mappings[bdf].iommu;
+}
+
+/*
+ * Some devices use an alias id and the original device id to index the
+ * interrupt table and the I/O page table respectively. Such devices have
+ * both an alias entry and a select entry in the IVRS structure.
+ *
+ * Return the original device id if the device has a valid interrupt
+ * remapping table set up for both the select entry and the alias entry.
+ */
+int get_dma_requestor_id(u16 bdf)
+{
+    int req_id;
+
+    BUG_ON ( bdf >= ivrs_bdf_entries );
+    req_id = ivrs_mappings[bdf].dte_requestor_id;
+    if ( (ivrs_mappings[bdf].intremap_table != NULL) &&
+         (ivrs_mappings[req_id].intremap_table != NULL) )
+        req_id = bdf;
+
+    return req_id;
+}
+
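+/*
+ * Bind the device table entry for this bdf to the domain's I/O page tables,
+ * unless the entry is already marked valid.
+ */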
+static void amd_iommu_setup_domain_device(
+    struct domain *domain, struct amd_iommu *iommu, int bdf)
+{
+    void *dte;
+    unsigned long flags;
+    int req_id, valid = 1;
+    struct hvm_iommu *hd = domain_hvm_iommu(domain);
+
+    BUG_ON( !hd->root_table || !hd->paging_mode || !iommu->dev_table.buffer );
+
+    if ( iommu_passthrough && (domain->domain_id == 0) )
+        valid = 0;
+    
+    /* get device-table entry */
+    req_id = get_dma_requestor_id(bdf);
+    dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
+
+    spin_lock_irqsave(&iommu->lock, flags);
+    if ( !amd_iommu_is_dte_page_translation_valid((u32 *)dte) )
+    {
+        /* bind DTE to domain page-tables */
+        amd_iommu_set_root_page_table(
+            (u32 *)dte, page_to_maddr(hd->root_table), hd->domain_id,
+            hd->paging_mode, valid);
+
+        invalidate_dev_table_entry(iommu, req_id);
+        flush_command_buffer(iommu);
+        
+        AMD_IOMMU_DEBUG("Setup I/O page table at DTE:0x%x, root_table:%"
+                        PRIx64", domain_id:%d, paging_mode:%d\n", req_id,
+                        (u64)page_to_maddr(hd->root_table), hd->domain_id, 
+                        hd->paging_mode);
+    }
+    spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static void amd_iommu_setup_dom0_devices(struct domain *d)
+{
+    struct amd_iommu *iommu;
+    struct pci_dev *pdev;
+    int bus, dev, func;
+    u32 l;
+    int bdf;
+
+    spin_lock(&pcidevs_lock);
+    for ( bus = 0; bus < 256; bus++ )
+    {
+        for ( dev = 0; dev < 32; dev++ )
+        {
+            for ( func = 0; func < 8; func++ )
+            {
+                l = pci_conf_read32(bus, dev, func, PCI_VENDOR_ID);
+                /* some broken boards return 0 or ~0 if a slot is empty: */
+                if ( (l == 0xffffffff) || (l == 0x00000000) ||
+                     (l == 0x0000ffff) || (l == 0xffff0000) )
+                    continue;
+
+                pdev = alloc_pdev(bus, PCI_DEVFN(dev, func));
+                pdev->domain = d;
+                list_add(&pdev->domain_list, &d->arch.pdev_list);
+
+                bdf = (bus << 8) | pdev->devfn;
+                /* supported device? */
+                iommu = (bdf < ivrs_bdf_entries) ?
+                    find_iommu_for_device(bus, pdev->devfn) : NULL;
+
+                if ( iommu )
+                    amd_iommu_setup_domain_device(d, iommu, bdf);
+            }
+        }
+    }
+    spin_unlock(&pcidevs_lock);
+}
+
+int amd_iov_detect(void)
+{
+    INIT_LIST_HEAD(&amd_iommu_head);
+
+    amd_iommu_detect_acpi();
+
+    if ( !iommu_found() )
+    {
+        printk("AMD-Vi: IOMMU not found!\n");
+        return -ENODEV;
+    }
+
+    if ( amd_iommu_init() != 0 )
+    {
+        printk("AMD-Vi: Error initializing IOMMU!\n");
+        return -ENODEV;
+    }
+
+    return 0;
+}
+
+static int allocate_domain_resources(struct hvm_iommu *hd)
+{
+    /* allocate root table */
+    spin_lock(&hd->mapping_lock);
+    if ( !hd->root_table )
+    {
+        hd->root_table = alloc_amd_iommu_pgtable();
+        if ( !hd->root_table )
+        {
+            spin_unlock(&hd->mapping_lock);
+            return -ENOMEM;
+        }
+    }
+    spin_unlock(&hd->mapping_lock);
+    return 0;
+}
+
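+/* Return the number of I/O page-table levels needed to map 'entries' frames. */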
+static int get_paging_mode(unsigned long entries)
+{
+    int level = 1;
+
+    BUG_ON(!max_page);
+
+    if ( entries > max_page )
+        entries = max_page;
+
+    while ( entries > PTE_PER_TABLE_SIZE )
+    {
+        entries = PTE_PER_TABLE_ALIGN(entries) >> PTE_PER_TABLE_SHIFT;
+        if ( ++level > 6 )
+            return -ENOMEM;
+    }
+
+    return level;
+}
+
+static int amd_iommu_domain_init(struct domain *domain)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(domain);
+
+    /* allocate page directory */
+    if ( allocate_domain_resources(hd) != 0 )
+    {
+        if ( hd->root_table )
+            free_domheap_page(hd->root_table);
+        return -ENOMEM;
+    }
+
+    hd->paging_mode = is_hvm_domain(domain)?
+        IOMMU_PAGE_TABLE_LEVEL_4 : get_paging_mode(max_page);
+
+    if ( domain->domain_id == 0 )
+    {
+        unsigned long i;
+
+        if ( !iommu_passthrough )
+        {
+            /* setup 1:1 page table for dom0 */
+            for ( i = 0; i < max_page; i++ )
+                amd_iommu_map_page(domain, i, i);
+        }
+
+        amd_iommu_setup_dom0_devices(domain);
+    }
+
+    hd->domain_id = domain->domain_id;
+
+    return 0;
+}
+
+static void amd_iommu_disable_domain_device(
+    struct domain *domain, struct amd_iommu *iommu, int bdf)
+{
+    void *dte;
+    unsigned long flags;
+    int req_id;
+
+    req_id = get_dma_requestor_id(bdf);
+    dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
+
+    spin_lock_irqsave(&iommu->lock, flags); 
+    if ( amd_iommu_is_dte_page_translation_valid((u32 *)dte) )
+    {
+        memset (dte, 0, IOMMU_DEV_TABLE_ENTRY_SIZE);
+        invalidate_dev_table_entry(iommu, req_id);
+        flush_command_buffer(iommu);
+        AMD_IOMMU_DEBUG("Disable DTE:0x%x,"
+                        " domain_id:%d, paging_mode:%d\n",
+                        req_id,  domain_hvm_iommu(domain)->domain_id,
+                        domain_hvm_iommu(domain)->paging_mode);
+    }
+    spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static int reassign_device( struct domain *source, struct domain *target,
+                            u8 bus, u8 devfn)
+{
+    struct pci_dev *pdev;
+    struct amd_iommu *iommu;
+    int bdf;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    pdev = pci_get_pdev_by_domain(source, bus, devfn);
+    if ( !pdev )
+        return -ENODEV;
+
+    bdf = (bus << 8) | devfn;
+    /* supported device? */
+    iommu = (bdf < ivrs_bdf_entries) ?
+        find_iommu_for_device(bus, pdev->devfn) : NULL;
+
+    if ( !iommu )
+    {
+        AMD_IOMMU_DEBUG("Fail to find iommu."
+                        " %x:%x.%x cannot be assigned to domain %d\n", 
+                        bus, PCI_SLOT(devfn), PCI_FUNC(devfn), 
+                        target->domain_id);
+        return -ENODEV;
+    }
+
+    amd_iommu_disable_domain_device(source, iommu, bdf);
+
+    list_move(&pdev->domain_list, &target->arch.pdev_list);
+    pdev->domain = target;
+
+    amd_iommu_setup_domain_device(target, iommu, bdf);
+    AMD_IOMMU_DEBUG("reassign %x:%x.%x domain %d -> domain %d\n",
+                    bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+                    source->domain_id, target->domain_id);
+
+    return 0;
+}
+
+static int amd_iommu_assign_device(struct domain *d, u8 bus, u8 devfn)
+{
+    int bdf = (bus << 8) | devfn;
+    int req_id = get_dma_requestor_id(bdf);
+
+    amd_iommu_sync_p2m(d);
+
+    if ( ivrs_mappings[req_id].unity_map_enable )
+    {
+        amd_iommu_reserve_domain_unity_map(
+            d,
+            ivrs_mappings[req_id].addr_range_start,
+            ivrs_mappings[req_id].addr_range_length,
+            ivrs_mappings[req_id].write_permission,
+            ivrs_mappings[req_id].read_permission);
+    }
+
+    return reassign_device(dom0, d, bus, devfn);
+}
+
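+/* Recursively free the I/O page-table subtree rooted at pg. */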
+static void deallocate_next_page_table(struct page_info* pg, int level)
+{
+    void *table_vaddr, *pde;
+    u64 next_table_maddr;
+    int index;
+
+    table_vaddr = map_domain_page(page_to_mfn(pg));
+
+    if ( level > 1 )
+    {
+        for ( index = 0; index < PTE_PER_TABLE_SIZE; index++ )
+        {
+            pde = table_vaddr + (index * IOMMU_PAGE_TABLE_ENTRY_SIZE);
+            next_table_maddr = amd_iommu_get_next_table_from_pte(pde);
+            if ( next_table_maddr != 0 )
+            {
+                deallocate_next_page_table(
+                    maddr_to_page(next_table_maddr), level - 1);
+            }
+        }
+    }
+
+    unmap_domain_page(table_vaddr);
+    free_amd_iommu_pgtable(pg);
+}
+
+static void deallocate_iommu_page_tables(struct domain *d)
+{
+    struct hvm_iommu *hd  = domain_hvm_iommu(d);
+
+    spin_lock(&hd->mapping_lock);
+    if ( hd->root_table )
+    {
+        deallocate_next_page_table(hd->root_table, hd->paging_mode);
+        hd->root_table = NULL;
+    }
+    spin_unlock(&hd->mapping_lock);
+}
+
+
+static void amd_iommu_domain_destroy(struct domain *d)
+{
+    deallocate_iommu_page_tables(d);
+    invalidate_all_iommu_pages(d);
+}
+
+static int amd_iommu_return_device(
+    struct domain *s, struct domain *t, u8 bus, u8 devfn)
+{
+    return reassign_device(s, t, bus, devfn);
+}
+
+static int amd_iommu_add_device(struct pci_dev *pdev)
+{
+    struct amd_iommu *iommu;
+    u16 bdf;
+    if ( !pdev->domain )
+        return -EINVAL;
+
+    bdf = (pdev->bus << 8) | pdev->devfn;
+    iommu = (bdf < ivrs_bdf_entries) ?
+        find_iommu_for_device(pdev->bus, pdev->devfn) : NULL;
+
+    if ( !iommu )
+    {
+        AMD_IOMMU_DEBUG("Fail to find iommu."
+                        " %x:%x.%x cannot be assigned to domain %d\n", 
+                        pdev->bus, PCI_SLOT(pdev->devfn),
+                        PCI_FUNC(pdev->devfn), pdev->domain->domain_id);
+        return -ENODEV;
+    }
+
+    amd_iommu_setup_domain_device(pdev->domain, iommu, bdf);
+    return 0;
+}
+
+static int amd_iommu_remove_device(struct pci_dev *pdev)
+{
+    struct amd_iommu *iommu;
+    u16 bdf;
+    if ( !pdev->domain )
+        return -EINVAL;
+
+    bdf = (pdev->bus << 8) | pdev->devfn;
+    iommu = (bdf < ivrs_bdf_entries) ?
+        find_iommu_for_device(pdev->bus, pdev->devfn) : NULL;
+
+    if ( !iommu )
+    {
+        AMD_IOMMU_DEBUG("Fail to find iommu."
+                        " %x:%x.%x cannot be removed from domain %d\n", 
+                        pdev->bus, PCI_SLOT(pdev->devfn),
+                        PCI_FUNC(pdev->devfn), pdev->domain->domain_id);
+        return -ENODEV;
+    }
+
+    amd_iommu_disable_domain_device(pdev->domain, iommu, bdf);
+    return 0;
+}
+
+static int amd_iommu_group_id(u8 bus, u8 devfn)
+{
+    int rt;
+    int bdf = (bus << 8) | devfn;
+    rt = ( bdf < ivrs_bdf_entries ) ?
+      get_dma_requestor_id(bdf) :
+      bdf;
+    return rt;
+}
+
+struct iommu_ops amd_iommu_ops = {
+    .init = amd_iommu_domain_init,
+    .add_device = amd_iommu_add_device,
+    .remove_device = amd_iommu_remove_device,
+    .assign_device  = amd_iommu_assign_device,
+    .teardown = amd_iommu_domain_destroy,
+    .map_page = amd_iommu_map_page,
+    .unmap_page = amd_iommu_unmap_page,
+    .reassign_device = amd_iommu_return_device,
+    .get_device_group_id = amd_iommu_group_id,
+    .update_ire_from_apic = amd_iommu_ioapic_update_ire,
+    .update_ire_from_msi = amd_iommu_msi_msg_update_ire,
+};
diff -Naurp xen/drivers/passthrough/io.c xen-redhat/drivers/passthrough/io.c
--- xen/drivers/passthrough/io.c
+++ xen-redhat/drivers/passthrough/io.c
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Allen Kay <allen.m.kay@intel.com>
+ * Copyright (C) Xiaohui Xin <xiaohui.xin@intel.com>
+ */
+
+#include <xen/event.h>
+#include <xen/iommu.h>
+#include <asm/hvm/irq.h>
+#include <asm/hvm/iommu.h>
+#include <xen/hvm/irq.h>
+
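+/*
+ * Timeout handler for passed-through line interrupts: if the guest has not
+ * EOIed the interrupt in time, deassert the virtual INTx lines, clear the
+ * pending state and EOI the physical interrupt so the line is not blocked.
+ */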
+static void pt_irq_time_out(void *data)
+{
+    struct hvm_mirq_dpci_mapping *irq_map = data;
+    unsigned int guest_gsi, machine_gsi = 0;
+    int vector;
+    struct hvm_irq_dpci *dpci = NULL;
+    struct dev_intx_gsi_link *digl;
+    uint32_t device, intx;
+
+    spin_lock(&irq_map->dom->event_lock);
+
+    dpci = domain_get_irq_dpci(irq_map->dom);
+    ASSERT(dpci);
+    list_for_each_entry ( digl, &irq_map->digl_list, list )
+    {
+        guest_gsi = digl->gsi;
+        machine_gsi = dpci->girq[guest_gsi].machine_gsi;
+        device = digl->device;
+        intx = digl->intx;
+        hvm_pci_intx_deassert(irq_map->dom, device, intx);
+    }
+
+    clear_bit(machine_gsi, dpci->dirq_mask);
+    vector = domain_irq_to_vector(irq_map->dom, machine_gsi);
+    dpci->mirq[machine_gsi].pending = 0;
+    spin_unlock(&irq_map->dom->event_lock);
+    pirq_guest_eoi(irq_map->dom, machine_gsi);
+}
+
+int pt_irq_create_bind_vtd(
+    struct domain *d, xen_domctl_bind_pt_irq_t *pt_irq_bind)
+{
+    struct hvm_irq_dpci *hvm_irq_dpci = NULL;
+    uint32_t machine_gsi, guest_gsi;
+    uint32_t device, intx, link;
+    struct dev_intx_gsi_link *digl;
+    int rc, pirq = pt_irq_bind->machine_irq;
+
+    if ( pirq < 0 || pirq >= NR_IRQS )
+        return -EINVAL;
+
+    spin_lock(&d->event_lock);
+
+    hvm_irq_dpci = domain_get_irq_dpci(d);
+    if ( hvm_irq_dpci == NULL )
+    {
+        hvm_irq_dpci = xmalloc(struct hvm_irq_dpci);
+        if ( hvm_irq_dpci == NULL )
+        {
+            spin_unlock(&d->event_lock);
+            return -ENOMEM;
+        }
+        memset(hvm_irq_dpci, 0, sizeof(*hvm_irq_dpci));
+        for ( int i = 0; i < NR_IRQS; i++ )
+            INIT_LIST_HEAD(&hvm_irq_dpci->mirq[i].digl_list);
+    }
+
+    if ( domain_set_irq_dpci(d, hvm_irq_dpci) == 0 )
+    {
+        xfree(hvm_irq_dpci);
+        spin_unlock(&d->event_lock);
+        return -EINVAL;
+    }
+
+    if ( pt_irq_bind->irq_type == PT_IRQ_TYPE_MSI )
+    {
+#ifdef SUPPORT_MSI_REMAPPING
+        if ( !test_and_set_bit(pirq, hvm_irq_dpci->mapping))
+        {
+            set_bit(_HVM_IRQ_DPCI_MSI, &hvm_irq_dpci->mirq[pirq].flags);
+            hvm_irq_dpci->mirq[pirq].gmsi.gvec = pt_irq_bind->u.msi.gvec;
+            hvm_irq_dpci->mirq[pirq].gmsi.gflags = pt_irq_bind->u.msi.gflags;
+            hvm_irq_dpci->msi_gvec_pirq[pt_irq_bind->u.msi.gvec] = pirq;
+            /* bind after hvm_irq_dpci is set up to avoid a race with the irq handler */
+            rc = pirq_guest_bind(d->vcpu[0], pirq, 0);
+            if ( rc == 0 )
+            {
+                rc = msixtbl_pt_register(d, pirq);
+                if ( unlikely(rc) )
+                    pirq_guest_unbind(d, pirq);
+            }
+            if ( unlikely(rc) )
+            {
+                hvm_irq_dpci->msi_gvec_pirq[pt_irq_bind->u.msi.gvec] = 0;
+                hvm_irq_dpci->mirq[pirq].gmsi.gflags = 0;
+                hvm_irq_dpci->mirq[pirq].gmsi.gvec = 0;
+                clear_bit(_HVM_IRQ_DPCI_MSI, &hvm_irq_dpci->mirq[pirq].flags);
+                clear_bit(pirq, hvm_irq_dpci->mapping);
+                spin_unlock(&d->event_lock);
+                return rc;
+            }
+        }
+        else
+        {
+            uint32_t old_gvec;
+
+            if ( !test_bit(_HVM_IRQ_DPCI_MSI, &hvm_irq_dpci->mirq[pirq].flags) )
+            {
+                spin_unlock(&d->event_lock);
+                return -EBUSY;
+            }
+
+            /* if pirq is already mapped as vmsi, update the guest data/addr */
+            old_gvec = hvm_irq_dpci->mirq[pirq].gmsi.gvec;
+            hvm_irq_dpci->msi_gvec_pirq[old_gvec] = 0;
+            hvm_irq_dpci->mirq[pirq].gmsi.gvec = pt_irq_bind->u.msi.gvec;
+            hvm_irq_dpci->mirq[pirq].gmsi.gflags = pt_irq_bind->u.msi.gflags;
+            hvm_irq_dpci->msi_gvec_pirq[pt_irq_bind->u.msi.gvec] = pirq;
+        }
+#else
+        return -ENOSYS;
+#endif
+    }
+    else
+    {
+        machine_gsi = pt_irq_bind->machine_irq;
+        device = pt_irq_bind->u.pci.device;
+        intx = pt_irq_bind->u.pci.intx;
+        guest_gsi = hvm_pci_intx_gsi(device, intx);
+        link = hvm_pci_intx_link(device, intx);
+        hvm_irq_dpci->link_cnt[link]++;
+
+        digl = xmalloc(struct dev_intx_gsi_link);
+        if ( !digl )
+        {
+            spin_unlock(&d->event_lock);
+            return -ENOMEM;
+        }
+
+        digl->device = device;
+        digl->intx = intx;
+        digl->gsi = guest_gsi;
+        digl->link = link;
+        list_add_tail(&digl->list,
+                      &hvm_irq_dpci->mirq[machine_gsi].digl_list);
+
+        hvm_irq_dpci->girq[guest_gsi].valid = 1;
+        hvm_irq_dpci->girq[guest_gsi].device = device;
+        hvm_irq_dpci->girq[guest_gsi].intx = intx;
+        hvm_irq_dpci->girq[guest_gsi].machine_gsi = machine_gsi;
+
+        /* Bind the same mirq once in the same domain */
+        if ( !test_and_set_bit(machine_gsi, hvm_irq_dpci->mapping))
+        {
+            unsigned int vector = domain_irq_to_vector(d, machine_gsi);
+
+            hvm_irq_dpci->mirq[machine_gsi].dom = d;
+
+            /* Init timer before binding */
+            init_timer(&hvm_irq_dpci->hvm_timer[vector],
+                       pt_irq_time_out, &hvm_irq_dpci->mirq[machine_gsi], 0);
+            /* Deal with gsi for legacy devices */
+            rc = pirq_guest_bind(d->vcpu[0], machine_gsi, BIND_PIRQ__WILL_SHARE);
+            if ( unlikely(rc) )
+            {
+                kill_timer(&hvm_irq_dpci->hvm_timer[vector]);
+                hvm_irq_dpci->mirq[machine_gsi].dom = NULL;
+                clear_bit(machine_gsi, hvm_irq_dpci->mapping);
+                hvm_irq_dpci->girq[guest_gsi].machine_gsi = 0;
+                hvm_irq_dpci->girq[guest_gsi].intx = 0;
+                hvm_irq_dpci->girq[guest_gsi].device = 0;
+                hvm_irq_dpci->girq[guest_gsi].valid = 0;
+                list_del(&digl->list);
+                hvm_irq_dpci->link_cnt[link]--;
+                spin_unlock(&d->event_lock);
+                xfree(digl);
+                return rc;
+            }
+        }
+
+        gdprintk(XENLOG_INFO VTDPREFIX,
+                 "VT-d irq bind: m_irq = %x device = %x intx = %x\n",
+                 machine_gsi, device, intx);
+    }
+    spin_unlock(&d->event_lock);
+    return 0;
+}
+
+int pt_irq_destroy_bind_vtd(
+    struct domain *d, xen_domctl_bind_pt_irq_t *pt_irq_bind)
+{
+    struct hvm_irq_dpci *hvm_irq_dpci = NULL;
+    uint32_t machine_gsi, guest_gsi;
+    uint32_t device, intx, link;
+    struct list_head *digl_list, *tmp;
+    struct dev_intx_gsi_link *digl;
+
+    machine_gsi = pt_irq_bind->machine_irq;
+    device = pt_irq_bind->u.pci.device;
+    intx = pt_irq_bind->u.pci.intx;
+    guest_gsi = hvm_pci_intx_gsi(device, intx);
+    link = hvm_pci_intx_link(device, intx);
+
+    gdprintk(XENLOG_INFO,
+             "pt_irq_destroy_bind_vtd: machine_gsi=%d "
+             "guest_gsi=%d, device=%d, intx=%d.\n",
+             machine_gsi, guest_gsi, device, intx);
+    spin_lock(&d->event_lock);
+
+    hvm_irq_dpci = domain_get_irq_dpci(d);
+
+    if ( hvm_irq_dpci == NULL )
+    {
+        spin_unlock(&d->event_lock);
+        return -EINVAL;
+    }
+
+    hvm_irq_dpci->link_cnt[link]--;
+    memset(&hvm_irq_dpci->girq[guest_gsi], 0,
+           sizeof(struct hvm_girq_dpci_mapping));
+
+    /* clear the mirq info */
+    if ( test_bit(machine_gsi, hvm_irq_dpci->mapping))
+    {
+        list_for_each_safe ( digl_list, tmp,
+                &hvm_irq_dpci->mirq[machine_gsi].digl_list )
+        {
+            digl = list_entry(digl_list,
+                    struct dev_intx_gsi_link, list);
+            if ( digl->device == device &&
+                 digl->intx   == intx &&
+                 digl->link   == link &&
+                 digl->gsi    == guest_gsi )
+            {
+                list_del(&digl->list);
+                xfree(digl);
+            }
+        }
+
+        if ( list_empty(&hvm_irq_dpci->mirq[machine_gsi].digl_list) )
+        {
+            pirq_guest_unbind(d, machine_gsi);
+#ifdef SUPPORT_MSI_REMAPPING
+            msixtbl_pt_unregister(d, machine_gsi);
+#endif
+            kill_timer(&hvm_irq_dpci->hvm_timer[domain_irq_to_vector(d, machine_gsi)]);
+            hvm_irq_dpci->mirq[machine_gsi].dom   = NULL;
+            hvm_irq_dpci->mirq[machine_gsi].flags = 0;
+            clear_bit(machine_gsi, hvm_irq_dpci->mapping);
+        }
+    }
+    spin_unlock(&d->event_lock);
+    gdprintk(XENLOG_INFO,
+             "XEN_DOMCTL_irq_unmapping: m_irq = %x device = %x intx = %x\n",
+             machine_gsi, device, intx);
+
+    return 0;
+}
+
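+/*
+ * Called from the physical IRQ handler: mark a pass-through IRQ as pending
+ * for the domain and kick VCPU0, which delivers it via hvm_dirq_assist().
+ * Returns 0 if the IRQ is not routed to this domain, 1 otherwise.
+ */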
+int hvm_do_IRQ_dpci(struct domain *d, unsigned int mirq)
+{
+    struct hvm_irq_dpci *dpci = domain_get_irq_dpci(d);
+
+    ASSERT(spin_is_locked(&irq_desc[domain_irq_to_vector(d, mirq)].lock));
+    if ( !iommu_enabled || (d == dom0) || !dpci ||
+         !test_bit(mirq, dpci->mapping))
+        return 0;
+
+    /*
+     * Set a timer here to avoid situations where the IRQ line is shared, and
+     * the device belonging to the pass-through guest is not yet active. In
+     * this case the guest may not pick up the interrupt (e.g., masked at the
+     * PIC) and we need to detect that.
+     */
+    set_bit(mirq, dpci->dirq_mask);
+    if ( !test_bit(_HVM_IRQ_DPCI_MSI, &dpci->mirq[mirq].flags) )
+        set_timer(&dpci->hvm_timer[domain_irq_to_vector(d, mirq)],
+                  NOW() + PT_IRQ_TIME_OUT);
+    vcpu_kick(d->vcpu[0]);
+
+    return 1;
+}
+
+#ifdef SUPPORT_MSI_REMAPPING
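+/*
+ * Guest EOI of a remapped MSI vector: clear IRQ_INPROGRESS on the host
+ * irq_desc and EOI the underlying guest-bound pirq.
+ */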
+void hvm_dpci_msi_eoi(struct domain *d, int vector)
+{
+    struct hvm_irq_dpci *hvm_irq_dpci = d->arch.hvm_domain.irq.dpci;
+    irq_desc_t *desc;
+    int pirq;
+
+    if ( !iommu_enabled || (hvm_irq_dpci == NULL) )
+       return;
+
+    spin_lock(&d->event_lock);
+    pirq = hvm_irq_dpci->msi_gvec_pirq[vector];
+
+    if ( (pirq >= 0) && (pirq < NR_IRQS) &&
+         test_bit(pirq, hvm_irq_dpci->mapping) &&
+         test_bit(_HVM_IRQ_DPCI_MSI, &hvm_irq_dpci->mirq[pirq].flags) )
+    {
+        BUG_ON(!local_irq_is_enabled());
+        desc = domain_spin_lock_irq_desc(d, pirq, NULL);
+        if ( !desc )
+        {
+            spin_unlock(&d->event_lock);
+            return;
+        }
+
+        desc->status &= ~IRQ_INPROGRESS;
+        spin_unlock_irq(&desc->lock);
+
+        pirq_guest_eoi(d, pirq);
+    }
+
+    spin_unlock(&d->event_lock);
+}
+
+extern int vmsi_deliver(struct domain *d, int pirq);
+static int hvm_pci_msi_assert(struct domain *d, int pirq)
+{
+    return vmsi_deliver(d, pirq);
+}
+#endif
+
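+/*
+ * Runs in VCPU0 context: deliver each pending pass-through IRQ to the guest,
+ * either as a virtual MSI or by asserting the guest PCI INTx line(s), and
+ * arm the time-out timer for level-triggered interrupts.
+ */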
+void hvm_dirq_assist(struct vcpu *v)
+{
+    unsigned int irq;
+    uint32_t device, intx;
+    struct domain *d = v->domain;
+    struct hvm_irq_dpci *hvm_irq_dpci = d->arch.hvm_domain.irq.dpci;
+    struct dev_intx_gsi_link *digl;
+
+    if ( !iommu_enabled || (v->vcpu_id != 0) || (hvm_irq_dpci == NULL) )
+        return;
+
+    for ( irq = find_first_bit(hvm_irq_dpci->dirq_mask, NR_IRQS);
+          irq < NR_IRQS;
+          irq = find_next_bit(hvm_irq_dpci->dirq_mask, NR_IRQS, irq + 1) )
+    {
+        if ( !test_and_clear_bit(irq, &hvm_irq_dpci->dirq_mask) )
+            continue;
+
+        spin_lock(&d->event_lock);
+#ifdef SUPPORT_MSI_REMAPPING
+        if ( test_bit(_HVM_IRQ_DPCI_MSI, &hvm_irq_dpci->mirq[irq].flags) )
+        {
+            hvm_pci_msi_assert(d, irq);
+            spin_unlock(&d->event_lock);
+            continue;
+        }
+#endif
+        stop_timer(&hvm_irq_dpci->hvm_timer[domain_irq_to_vector(d, irq)]);
+
+        list_for_each_entry ( digl, &hvm_irq_dpci->mirq[irq].digl_list, list )
+        {
+            device = digl->device;
+            intx = digl->intx;
+            hvm_pci_intx_assert(d, device, intx);
+            hvm_irq_dpci->mirq[irq].pending++;
+        }
+
+        /*
+         * Set a timer to see if the guest can finish the interrupt or not. For
+         * example, the guest OS may unmask the PIC during boot, before the
+         * guest driver is loaded. hvm_pci_intx_assert() may succeed, but the
+         * guest will never deal with the irq, then the physical interrupt line
+         * will never be deasserted.
+         */
+        set_timer(&hvm_irq_dpci->hvm_timer[domain_irq_to_vector(d, irq)],
+                  NOW() + PT_IRQ_TIME_OUT);
+        spin_unlock(&d->event_lock);
+    }
+}
+
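+/*
+ * Guest EOI of an IO-APIC GSI: deassert the virtual INTx line and, once all
+ * pending instances are handled and the RTE is unmasked, stop the time-out
+ * timer and EOI the machine interrupt.
+ */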
+void hvm_dpci_eoi(struct domain *d, unsigned int guest_gsi,
+                  union vioapic_redir_entry *ent)
+{
+    struct hvm_irq_dpci *hvm_irq_dpci = NULL;
+    uint32_t device, intx, machine_gsi;
+
+    if ( !iommu_enabled)
+        return;
+
+    if ( guest_gsi < NR_ISAIRQS )
+    {
+        hvm_dpci_isairq_eoi(d, guest_gsi);
+        return;
+    }
+
+    spin_lock(&d->event_lock);
+    hvm_irq_dpci = domain_get_irq_dpci(d);
+
+    if ( (hvm_irq_dpci == NULL) ||
+         (guest_gsi >= NR_ISAIRQS &&
+          !hvm_irq_dpci->girq[guest_gsi].valid) )
+    {
+        spin_unlock(&d->event_lock);
+        return;
+    }
+
+    device = hvm_irq_dpci->girq[guest_gsi].device;
+    intx = hvm_irq_dpci->girq[guest_gsi].intx;
+    hvm_pci_intx_deassert(d, device, intx);
+
+    machine_gsi = hvm_irq_dpci->girq[guest_gsi].machine_gsi;
+    if ( --hvm_irq_dpci->mirq[machine_gsi].pending == 0 )
+    {
+        if ( (ent == NULL) || !ent->fields.mask )
+        {
+            /*
+             * No need to get vector lock for timer
+             * since interrupt is still not EOIed
+             */
+            stop_timer(&hvm_irq_dpci->hvm_timer[
+                domain_irq_to_vector(d, machine_gsi)]);
+            pirq_guest_eoi(d, machine_gsi);
+        }
+    }
+    spin_unlock(&d->event_lock);
+}
diff -Naurp xen/drivers/passthrough/iommu.c xen-redhat/drivers/passthrough/iommu.c
--- xen/drivers/passthrough/iommu.c
+++ xen-redhat/drivers/passthrough/iommu.c
@@ -0,0 +1,282 @@
+/*
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <xen/sched.h>
+#include <xen/iommu.h>
+#include <asm/hvm/iommu.h>
+#include <xen/paging.h>
+#include <xen/guest_access.h>
+
+static void parse_iommu_param(char *s);
+int intel_vtd_setup(void);
+
+/*
+ * The 'iommu' parameter enables the IOMMU.  Optional comma separated
+ * value may contain:
+ *
+ *   off|no|false|disable       Disable IOMMU (default)
+ *   force|required             Don't boot unless IOMMU is enabled
+ *   passthrough                Bypass VT-d translation for Dom0
+ *   snoop                      Utilize the snoop control for IOMMU (default)
+ *   no-snoop                   Don't utilize the snoop control for IOMMU
+ *   no-intremap                Disable interrupt remapping
+ *   amd-iommu-debug            Turn on debug info for AMD IOMMU
+ *   amd-iommu-perdev-intremap  Enable per-device interrupt remapping (AMD IOMMU)
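+ *
+ *   For example, "iommu=force,no-snoop" requires an IOMMU to be present and
+ *   disables snoop control.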
+ */
+custom_param("iommu", parse_iommu_param);
+int iommu_enabled = 0;
+int force_iommu = 0;
+int iommu_passthrough = 0;
+int iommu_snoop = 0;
+int iommu_intremap = 0;
+int amd_iommu_debug = 0;
+int amd_iommu_perdev_intremap = 0;
+
+static void __init parse_iommu_param(char *s)
+{
+    char *ss;
+    iommu_enabled = 1;
+    iommu_snoop = 1;
+    iommu_intremap = 1;
+    amd_iommu_debug = 0;
+    amd_iommu_perdev_intremap = 0;
+
+    do {
+        ss = strchr(s, ',');
+        if ( ss )
+            *ss = '\0';
+
+        if ( !strcmp(s, "off") || !strcmp(s, "no") || !strcmp(s, "false") ||
+             !strcmp(s, "0") || !strcmp(s, "disable") )
+            iommu_enabled = 0;
+        else if ( !strcmp(s, "force") || !strcmp(s, "required") )
+            force_iommu = 1;
+        else if ( !strcmp(s, "passthrough") )
+            iommu_passthrough = 1;
+        else if ( !strcmp(s, "snoop") )
+            iommu_snoop = 1;
+        else if ( !strcmp(s, "no-snoop") )
+            iommu_snoop = 0;
+        else if ( !strcmp(s, "no-intremap") )
+            iommu_intremap = 0;
+        else if ( !strcmp(s, "amd-iommu-debug") )
+            amd_iommu_debug = 1;
+        else if ( !strcmp(s, "amd-iommu-perdev-intremap") )
+            amd_iommu_perdev_intremap = 1;
+
+        s = ss + 1;
+    } while ( ss );
+}
+
+int iommu_domain_init(struct domain *domain)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(domain);
+
+    spin_lock_init(&hd->mapping_lock);
+    INIT_LIST_HEAD(&hd->g2m_ioport_list);
+
+    if ( !iommu_enabled )
+        return 0;
+
+    hd->platform_ops = iommu_get_ops();
+    return hd->platform_ops->init(domain);
+}
+
+int iommu_add_device(struct pci_dev *pdev)
+{
+    struct hvm_iommu *hd;
+
+    if ( !pdev->domain )
+        return -EINVAL;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
+    hd = domain_hvm_iommu(pdev->domain);
+    if ( !iommu_enabled || !hd->platform_ops )
+        return 0;
+
+    return hd->platform_ops->add_device(pdev);
+}
+
+int iommu_remove_device(struct pci_dev *pdev)
+{
+    struct hvm_iommu *hd;
+    if ( !pdev->domain )
+        return -EINVAL;
+
+    hd = domain_hvm_iommu(pdev->domain);
+    if ( !iommu_enabled || !hd->platform_ops )
+        return 0;
+
+    return hd->platform_ops->remove_device(pdev);
+}
+
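+/* Hand ownership of PCI device bus:devfn to domain d via the platform IOMMU. */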
+int assign_device(struct domain *d, u8 bus, u8 devfn)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    int rc = 0;
+
+    if ( !iommu_enabled || !hd->platform_ops )
+        return 0;
+
+    spin_lock(&pcidevs_lock);
+    rc = hd->platform_ops->assign_device(d, bus, devfn);
+    spin_unlock(&pcidevs_lock);
+    return rc;
+}
+
+void iommu_domain_destroy(struct domain *d)
+{
+    struct hvm_iommu *hd  = domain_hvm_iommu(d);
+    struct list_head *ioport_list, *tmp;
+    struct g2m_ioport *ioport;
+
+    if ( !iommu_enabled || !hd->platform_ops )
+        return;
+
+    if ( hd )
+    {
+        list_for_each_safe ( ioport_list, tmp, &hd->g2m_ioport_list )
+        {
+            ioport = list_entry(ioport_list, struct g2m_ioport, list);
+            list_del(&ioport->list);
+            xfree(ioport);
+        }
+    }
+
+    return hd->platform_ops->teardown(d);
+}
+
+int iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+
+    if ( !iommu_enabled || !hd->platform_ops )
+        return 0;
+
+    return hd->platform_ops->map_page(d, gfn, mfn);
+}
+
+int iommu_unmap_page(struct domain *d, unsigned long gfn)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+
+    if ( !iommu_enabled || !hd->platform_ops )
+        return 0;
+
+    return hd->platform_ops->unmap_page(d, gfn);
+}
+
+/* caller should hold the pcidevs_lock */
+int deassign_device(struct domain *d, u8 bus, u8 devfn)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    struct pci_dev *pdev = NULL;
+
+    if ( !iommu_enabled || !hd->platform_ops )
+        return -EINVAL;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    pdev = pci_get_pdev(bus, devfn);
+    if (!pdev)
+        return -ENODEV;
+
+    if (pdev->domain != d)
+    {
+        gdprintk(XENLOG_ERR VTDPREFIX,
+                "IOMMU: deassign a device not owned\n");
+        return -EINVAL;
+    }
+
+    return hd->platform_ops->reassign_device(d, dom0, bus, devfn);
+}
+
+int iommu_setup(void)
+{
+    int rc = -ENODEV;
+
+    if ( !iommu_enabled )
+        goto out;
+
+    rc = iommu_hardware_setup();
+
+    iommu_enabled = (rc == 0);
+
+ out:
+    if ( force_iommu && !iommu_enabled )
+        panic("IOMMU setup failed, crash Xen for security purpose!\n");
+
+    printk("I/O virtualisation %sabled\n", iommu_enabled ? "en" : "dis");
+    return rc;
+}
+
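+/*
+ * Copy into 'buf' the BDFs of the devices assigned to 'd' that share an
+ * IOMMU device group with bus:devfn; returns how many were copied.
+ */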
+int iommu_get_device_group(struct domain *d, u8 bus, u8 devfn,
+    XEN_GUEST_HANDLE_64(uint32_t) buf, int max_sdevs)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    struct pci_dev *pdev;
+    int group_id, sdev_id;
+    u32 bdf;
+    int i = 0;
+    struct iommu_ops *ops = hd->platform_ops;
+
+    if ( !iommu_enabled || !ops || !ops->get_device_group_id )
+        return 0;
+
+    group_id = ops->get_device_group_id(bus, devfn);
+
+    spin_lock(&pcidevs_lock);
+    for_each_pdev( d, pdev )
+    {
+        if ( (pdev->bus == bus) && (pdev->devfn == devfn) )
+            continue;
+
+        sdev_id = ops->get_device_group_id(pdev->bus, pdev->devfn);
+        if ( (sdev_id == group_id) && (i < max_sdevs) )
+        {
+            bdf = 0;
+            bdf |= (pdev->bus & 0xff) << 16;
+            bdf |= (pdev->devfn & 0xff) << 8;
+            if ( unlikely(copy_to_guest_offset(buf, i, &bdf, 1)) )
+            {
+                spin_unlock(&pcidevs_lock);
+                return -1;
+            }
+            i++;
+        }
+    }
+    spin_unlock(&pcidevs_lock);
+
+    return i;
+}
+
+void iommu_update_ire_from_apic(
+    unsigned int apic, unsigned int reg, unsigned int value)
+{
+    struct iommu_ops *ops = iommu_get_ops();
+    ops->update_ire_from_apic(apic, reg, value);
+}
+void iommu_update_ire_from_msi(
+    struct msi_desc *msi_desc, struct msi_msg *msg)
+{
+    struct iommu_ops *ops = iommu_get_ops();
+    ops->update_ire_from_msi(msi_desc, msg);
+}
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -Naurp xen/drivers/passthrough/Makefile xen-redhat/drivers/passthrough/Makefile
--- xen/drivers/passthrough/Makefile
+++ xen-redhat/drivers/passthrough/Makefile
@@ -0,0 +1,8 @@
+subdir-$(x86_32) += vtd
+subdir-$(x86_64) += vtd
+subdir-$(x86_32) += amd
+subdir-$(x86_64) += amd
+
+obj-y += iommu.o
+obj-y += pci.o
+obj-y += io.o
diff -Naurp xen/drivers/passthrough/pci.c xen-redhat/drivers/passthrough/pci.c
--- xen/drivers/passthrough/pci.c
+++ xen-redhat/drivers/passthrough/pci.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (C) 2008,  Netronome Systems, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <xen/sched.h>
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
+#include <xen/list.h>
+#include <xen/prefetch.h>
+#include <xen/iommu.h>
+#include <asm/hvm/iommu.h>
+#include <asm/hvm/irq.h>
+#include <xen/delay.h>
+#include <xen/keyhandler.h>
+
+
+LIST_HEAD(alldevs_list);
+spinlock_t pcidevs_lock = SPIN_LOCK_UNLOCKED;
+
+struct pci_dev *alloc_pdev(u8 bus, u8 devfn)
+{
+    struct pci_dev *pdev;
+
+    list_for_each_entry ( pdev, &alldevs_list, alldevs_list )
+        if ( pdev->bus == bus && pdev->devfn == devfn )
+            return pdev;
+
+    pdev = xmalloc(struct pci_dev);
+    if ( !pdev )
+        return NULL;
+    memset(pdev, 0, sizeof(struct pci_dev));
+
+    *((u8*) &pdev->bus) = bus;
+    *((u8*) &pdev->devfn) = devfn;
+    pdev->domain = NULL;
+    INIT_LIST_HEAD(&pdev->msi_list);
+    list_add(&pdev->alldevs_list, &alldevs_list);
+    spin_lock_init(&pdev->msix_table_lock);
+
+    return pdev;
+}
+
+void free_pdev(struct pci_dev *pdev)
+{
+    list_del(&pdev->alldevs_list);
+    xfree(pdev);
+}
+
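+/*
+ * Find a PCI device on the global list; bus == -1 or devfn == -1 acts as a
+ * wildcard.  The caller must hold pcidevs_lock.
+ */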
+struct pci_dev *pci_get_pdev(int bus, int devfn)
+{
+    struct pci_dev *pdev = NULL;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
+    list_for_each_entry ( pdev, &alldevs_list, alldevs_list )
+        if ( (pdev->bus == bus || bus == -1) &&
+             (pdev->devfn == devfn || devfn == -1) )
+        {
+            return pdev;
+        }
+
+    return NULL;
+}
+
+struct pci_dev *pci_get_pdev_by_domain(struct domain *d, int bus, int devfn)
+{
+    struct pci_dev *pdev = NULL;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
+    list_for_each_entry ( pdev, &alldevs_list, alldevs_list )
+         if ( (pdev->bus == bus || bus == -1) &&
+              (pdev->devfn == devfn || devfn == -1) &&
+              (pdev->domain == d) )
+         {
+             return pdev;
+         }
+
+    return NULL;
+}
+
+int pci_add_device(u8 bus, u8 devfn)
+{
+    struct pci_dev *pdev;
+    int ret = -ENOMEM;
+
+    spin_lock(&pcidevs_lock);
+    pdev = alloc_pdev(bus, devfn);
+    if ( !pdev )
+        goto out;
+
+    ret = 0;
+    if ( !pdev->domain )
+    {
+        pdev->domain = dom0;
+        ret = iommu_add_device(pdev);
+        if ( ret )
+            goto out;
+
+        list_add(&pdev->domain_list, &dom0->arch.pdev_list);
+    }
+
+out:
+    spin_unlock(&pcidevs_lock);
+    printk(XENLOG_DEBUG "PCI add device %02x:%02x.%x\n", bus,
+           PCI_SLOT(devfn), PCI_FUNC(devfn));
+    return ret;
+}
+
+int pci_remove_device(u8 bus, u8 devfn)
+{
+    struct pci_dev *pdev;
+    int ret = -ENODEV;
+
+    spin_lock(&pcidevs_lock);
+    list_for_each_entry ( pdev, &alldevs_list, alldevs_list )
+        if ( pdev->bus == bus && pdev->devfn == devfn )
+        {
+            ret = iommu_remove_device(pdev);
+            if ( pdev->domain )
+                list_del(&pdev->domain_list);
+            pci_cleanup_msi(pdev);
+            free_pdev(pdev);
+            printk(XENLOG_DEBUG "PCI remove device %02x:%02x.%x\n", bus,
+                   PCI_SLOT(devfn), PCI_FUNC(devfn));
+            break;
+        }
+
+    spin_unlock(&pcidevs_lock);
+    return ret;
+}
+
+int pci_set_device_msixtbl(u8 bus, u8 devfn, u64 gtable)
+{
+    struct pci_dev *pdev;
+
+    pdev = pci_get_pdev(bus, devfn);
+
+    if ( !pdev )
+        return -ENODEV;
+
+    pdev->msix_table = gtable;
+
+    return 0;
+}
+
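+/*
+ * Tear down every pass-through IRQ binding of a dying domain: unbind the
+ * guest pirqs, kill the time-out timers and free the dev/INTx/GSI links.
+ */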
+static void pci_clean_dpci_irqs(struct domain *d)
+{
+    struct hvm_irq_dpci *hvm_irq_dpci = NULL;
+    uint32_t i;
+    struct list_head *digl_list, *tmp;
+    struct dev_intx_gsi_link *digl;
+
+    if ( !iommu_enabled )
+        return;
+
+    spin_lock(&d->event_lock);
+    hvm_irq_dpci = domain_get_irq_dpci(d);
+    if ( hvm_irq_dpci != NULL )
+    {
+        for ( i = find_first_bit(hvm_irq_dpci->mapping, NR_IRQS);
+              i < NR_IRQS;
+              i = find_next_bit(hvm_irq_dpci->mapping, NR_IRQS, i + 1) )
+        {
+            pirq_guest_unbind(d, i);
+            kill_timer(&hvm_irq_dpci->hvm_timer[irq_to_vector(i)]);
+
+            list_for_each_safe ( digl_list, tmp,
+                                 &hvm_irq_dpci->mirq[i].digl_list )
+            {
+                digl = list_entry(digl_list,
+                                  struct dev_intx_gsi_link, list);
+                list_del(&digl->list);
+                xfree(digl);
+            }
+        }
+
+        d->arch.hvm_domain.irq.dpci = NULL;
+        xfree(hvm_irq_dpci);
+    }
+    spin_unlock(&d->event_lock);
+}
+
+void pci_release_devices(struct domain *d)
+{
+    struct pci_dev *pdev;
+    u8 bus, devfn;
+
+    spin_lock(&pcidevs_lock);
+    pci_clean_dpci_irqs(d);
+    while ( (pdev = pci_get_pdev_by_domain(d, -1, -1)) )
+    {
+        pci_cleanup_msi(pdev);
+        bus = pdev->bus; devfn = pdev->devfn;
+        deassign_device(d, bus, devfn);
+    }
+    spin_unlock(&pcidevs_lock);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -Naurp xen/drivers/passthrough/vtd/dmar.c xen-redhat/drivers/passthrough/vtd/dmar.c
--- xen/drivers/passthrough/vtd/dmar.c
+++ xen-redhat/drivers/passthrough/vtd/dmar.c
@@ -0,0 +1,548 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Ashok Raj <ashok.raj@intel.com>
+ * Copyright (C) Shaohua Li <shaohua.li@intel.com>
+ * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen
+ */
+
+#include <xen/init.h>
+#include <xen/bitmap.h>
+#include <xen/kernel.h>
+#include <xen/acpi.h>
+#include <xen/mm.h>
+#include <xen/xmalloc.h>
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
+#include <asm/string.h>
+#include "dmar.h"
+#include "iommu.h"
+
+int vtd_enabled = 1;
+
+#undef PREFIX
+#define PREFIX VTDPREFIX "ACPI DMAR:"
+#define DEBUG
+
+#define MIN_SCOPE_LEN (sizeof(struct acpi_pci_path) + \
+                       sizeof(struct acpi_dev_scope))
+
+LIST_HEAD(acpi_drhd_units);
+LIST_HEAD(acpi_rmrr_units);
+LIST_HEAD(acpi_atsr_units);
+
+u8 dmar_host_address_width;
+
+void dmar_scope_add_buses(struct dmar_scope *scope, u16 sec_bus, u16 sub_bus)
+{
+    sub_bus &= 0xff;
+    if (sec_bus > sub_bus)
+        return;
+
+    while ( sec_bus <= sub_bus )
+        set_bit(sec_bus++, scope->buses);
+}
+
+void dmar_scope_remove_buses(struct dmar_scope *scope, u16 sec_bus, u16 sub_bus)
+{
+    sub_bus &= 0xff;
+    if (sec_bus > sub_bus)
+        return;
+
+    while ( sec_bus <= sub_bus )
+        clear_bit(sec_bus++, scope->buses);
+}
+
+static int __init acpi_register_drhd_unit(struct acpi_drhd_unit *drhd)
+{
+    /*
+     * Add INCLUDE_ALL at the tail, so a scan of the list finds it at
+     * the very end.
+     */
+    if ( drhd->include_all )
+        list_add_tail(&drhd->list, &acpi_drhd_units);
+    else
+        list_add(&drhd->list, &acpi_drhd_units);
+    return 0;
+}
+
+static int __init acpi_register_rmrr_unit(struct acpi_rmrr_unit *rmrr)
+{
+    list_add(&rmrr->list, &acpi_rmrr_units);
+    return 0;
+}
+
+static void __init disable_all_dmar_units(void)
+{
+    struct acpi_drhd_unit *drhd, *_drhd;
+    struct acpi_rmrr_unit *rmrr, *_rmrr;
+    struct acpi_atsr_unit *atsr, *_atsr;
+
+    list_for_each_entry_safe ( drhd, _drhd, &acpi_drhd_units, list )
+    {
+        list_del(&drhd->list);
+        xfree(drhd);
+    }
+    list_for_each_entry_safe ( rmrr, _rmrr, &acpi_rmrr_units, list )
+    {
+        list_del(&rmrr->list);
+        xfree(rmrr);
+    }
+    list_for_each_entry_safe ( atsr, _atsr, &acpi_atsr_units, list )
+    {
+        list_del(&atsr->list);
+        xfree(atsr);
+    }
+}
+
+static int acpi_ioapic_device_match(
+    struct list_head *ioapic_list, unsigned int apic_id)
+{
+    struct acpi_ioapic_unit *ioapic;
+    list_for_each_entry( ioapic, ioapic_list, list ) {
+        if (ioapic->apic_id == apic_id)
+            return 1;
+    }
+    return 0;
+}
+
+struct acpi_drhd_unit * ioapic_to_drhd(unsigned int apic_id)
+{
+    struct acpi_drhd_unit *drhd;
+    list_for_each_entry( drhd, &acpi_drhd_units, list )
+        if ( acpi_ioapic_device_match(&drhd->ioapic_list, apic_id) )
+            return drhd;
+    return NULL;
+}
+
+struct iommu * ioapic_to_iommu(unsigned int apic_id)
+{
+    struct acpi_drhd_unit *drhd;
+
+    list_for_each_entry( drhd, &acpi_drhd_units, list )
+        if ( acpi_ioapic_device_match(&drhd->ioapic_list, apic_id) )
+            return drhd->iommu;
+    return NULL;
+}
+
+static int __init acpi_register_atsr_unit(struct acpi_atsr_unit *atsr)
+{
+    /*
+     * Add ALL_PORTS at the tail, so a scan of the list finds it at
+     * the very end.
+     */
+    if ( atsr->all_ports )
+        list_add_tail(&atsr->list, &acpi_atsr_units);
+    else
+        list_add(&atsr->list, &acpi_atsr_units);
+    return 0;
+}
+
+struct acpi_drhd_unit * acpi_find_matched_drhd_unit(u8 bus, u8 devfn)
+{
+    struct acpi_drhd_unit *drhd;
+    struct acpi_drhd_unit *found = NULL, *include_all = NULL;
+    int i;
+
+    list_for_each_entry ( drhd, &acpi_drhd_units, list )
+    {
+        for (i = 0; i < drhd->scope.devices_cnt; i++)
+            if ( drhd->scope.devices[i] == PCI_BDF2(bus, devfn) )
+                return drhd;
+
+        if ( test_bit(bus, drhd->scope.buses) )
+            found = drhd;
+
+        if ( drhd->include_all )
+            include_all = drhd;
+    }
+
+    return found ? found : include_all;
+}
+
+struct acpi_atsr_unit * acpi_find_matched_atsr_unit(u8 bus, u8 devfn)
+{
+    struct acpi_atsr_unit *atsr;
+    struct acpi_atsr_unit *found = NULL, *include_all = NULL;
+    int i;
+
+    list_for_each_entry ( atsr, &acpi_atsr_units, list )
+    {
+        for (i = 0; i < atsr->scope.devices_cnt; i++)
+            if ( atsr->scope.devices[i] == PCI_BDF2(bus, devfn) )
+                return atsr;
+
+        if ( test_bit(bus, atsr->scope.buses) )
+            found = atsr;
+
+        if ( atsr->all_ports )
+            include_all = atsr;
+    }
+
+    return found ? found : include_all;
+}
+
+/*
+ * Count number of devices in device scope.  Do not include PCI sub
+ * hierarchies.
+ */
+static int scope_device_count(void *start, void *end)
+{
+    struct acpi_dev_scope *scope;
+    int count = 0;
+
+    while ( start < end )
+    {
+        scope = start;
+        if ( (scope->length < MIN_SCOPE_LEN) ||
+             (scope->dev_type >= ACPI_DEV_ENTRY_COUNT) )
+        {
+            dprintk(XENLOG_WARNING VTDPREFIX, "Invalid device scope.\n");
+            return -EINVAL;
+        }
+
+        if ( scope->dev_type == ACPI_DEV_ENDPOINT ||
+             scope->dev_type == ACPI_DEV_IOAPIC ||
+             scope->dev_type == ACPI_DEV_MSI_HPET )
+            count++;
+
+        start += scope->length;
+    }
+
+    return count;
+}
+
+
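+/*
+ * Walk an ACPI device-scope array: record endpoint, IOAPIC and MSI-HPET
+ * entries as BDFs in the owning DMAR/RMRR/ATSR scope, and add the bus ranges
+ * behind PCI-PCI bridges to the scope's bus bitmap.
+ */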
+static int __init acpi_parse_dev_scope(void *start, void *end,
+                                       void *acpi_entry, int type)
+{
+    struct dmar_scope *scope = acpi_entry;
+    struct acpi_ioapic_unit *acpi_ioapic_unit;
+    struct acpi_dev_scope *acpi_scope;
+    u16 bus, sub_bus, sec_bus;
+    struct acpi_pci_path *path;
+    int depth, cnt, didx = 0;
+
+    if ( (cnt = scope_device_count(start, end)) < 0 )
+        return cnt;
+
+    scope->devices_cnt = cnt;
+    if ( cnt > 0 )
+    {
+        scope->devices = xmalloc_array(u16, cnt);
+        if ( !scope->devices )
+            return -ENOMEM;
+        memset(scope->devices, 0, sizeof(u16) * cnt);
+    }
+
+    while ( start < end )
+    {
+        acpi_scope = start;
+        path = (struct acpi_pci_path *)(acpi_scope + 1);
+        depth = (acpi_scope->length - sizeof(struct acpi_dev_scope))
+                / sizeof(struct acpi_pci_path);
+        bus = acpi_scope->start_bus;
+
+        while ( --depth > 0 )
+        {
+            bus = pci_conf_read8(bus, path->dev, path->fn, PCI_SECONDARY_BUS);
+            path++;
+        }
+
+        switch ( acpi_scope->dev_type )
+        {
+        case ACPI_DEV_P2PBRIDGE:
+            sec_bus = pci_conf_read8(
+                bus, path->dev, path->fn, PCI_SECONDARY_BUS);
+            sub_bus = pci_conf_read8(
+                bus, path->dev, path->fn, PCI_SUBORDINATE_BUS);
+            dprintk(XENLOG_INFO VTDPREFIX,
+                    "found bridge: bdf = %x:%x.%x  sec = %x  sub = %x\n",
+                    bus, path->dev, path->fn, sec_bus, sub_bus);
+
+            dmar_scope_add_buses(scope, sec_bus, sub_bus);
+            break;
+
+        case ACPI_DEV_MSI_HPET:
+            dprintk(XENLOG_INFO VTDPREFIX, "found MSI HPET: bdf = %x:%x.%x\n",
+                    bus, path->dev, path->fn);
+            scope->devices[didx++] = PCI_BDF(bus, path->dev, path->fn);
+            break;
+
+        case ACPI_DEV_ENDPOINT:
+            dprintk(XENLOG_INFO VTDPREFIX, "found endpoint: bdf = %x:%x.%x\n",
+                    bus, path->dev, path->fn);
+            scope->devices[didx++] = PCI_BDF(bus, path->dev, path->fn);
+            break;
+
+        case ACPI_DEV_IOAPIC:
+            dprintk(XENLOG_INFO VTDPREFIX, "found IOAPIC: bdf = %x:%x.%x\n",
+                    bus, path->dev, path->fn);
+
+            if ( type == DMAR_TYPE )
+            {
+                struct acpi_drhd_unit *drhd = acpi_entry;
+                acpi_ioapic_unit = xmalloc(struct acpi_ioapic_unit);
+                if ( !acpi_ioapic_unit )
+                    return -ENOMEM;
+                acpi_ioapic_unit->apic_id = acpi_scope->enum_id;
+                acpi_ioapic_unit->ioapic.bdf.bus = bus;
+                acpi_ioapic_unit->ioapic.bdf.dev = path->dev;
+                acpi_ioapic_unit->ioapic.bdf.func = path->fn;
+                list_add(&acpi_ioapic_unit->list, &drhd->ioapic_list);
+            }
+
+            scope->devices[didx++] = PCI_BDF(bus, path->dev, path->fn);
+            break;
+        }
+
+        start += acpi_scope->length;
+    }
+
+    return 0;
+}
+
+static int __init
+acpi_parse_one_drhd(struct acpi_dmar_entry_header *header)
+{
+    struct acpi_table_drhd * drhd = (struct acpi_table_drhd *)header;
+    void *dev_scope_start, *dev_scope_end;
+    struct acpi_drhd_unit *dmaru;
+    int ret = 0;
+    static int include_all = 0;
+
+    dmaru = xmalloc(struct acpi_drhd_unit);
+    if ( !dmaru )
+        return -ENOMEM;
+    memset(dmaru, 0, sizeof(struct acpi_drhd_unit));
+
+    dmaru->address = drhd->address;
+    dmaru->include_all = drhd->flags & 1; /* BIT0: INCLUDE_ALL */
+    INIT_LIST_HEAD(&dmaru->ioapic_list);
+    dprintk(XENLOG_INFO VTDPREFIX, "dmaru->address = %"PRIx64"\n",
+            dmaru->address);
+
+    dev_scope_start = (void *)(drhd + 1);
+    dev_scope_end = ((void *)drhd) + header->length;
+    ret = acpi_parse_dev_scope(dev_scope_start, dev_scope_end,
+                               dmaru, DMAR_TYPE);
+
+    if ( dmaru->include_all )
+    {
+        dprintk(XENLOG_INFO VTDPREFIX, "found INCLUDE_ALL\n");
+        /* Only allow one INCLUDE_ALL */
+        if ( include_all )
+        {
+            dprintk(XENLOG_WARNING VTDPREFIX,
+                    "Only one INCLUDE_ALL device scope is allowed\n");
+            ret = -EINVAL;
+        }
+        include_all = 1;
+    }
+
+    if ( ret )
+        xfree(dmaru);
+    else
+        acpi_register_drhd_unit(dmaru);
+    return ret;
+}
+
+static int __init
+acpi_parse_one_rmrr(struct acpi_dmar_entry_header *header)
+{
+    struct acpi_table_rmrr *rmrr = (struct acpi_table_rmrr *)header;
+    struct acpi_rmrr_unit *rmrru;
+    void *dev_scope_start, *dev_scope_end;
+    int ret = 0;
+
+    if ( rmrr->base_address >= rmrr->end_address )
+    {
+        dprintk(XENLOG_ERR VTDPREFIX,
+                "RMRR error: base_addr %"PRIx64" end_address %"PRIx64"\n",
+                rmrr->base_address, rmrr->end_address);
+        return -EFAULT;
+    }
+
+    rmrru = xmalloc(struct acpi_rmrr_unit);
+    if ( !rmrru )
+        return -ENOMEM;
+    memset(rmrru, 0, sizeof(struct acpi_rmrr_unit));
+
+    rmrru->base_address = rmrr->base_address;
+    rmrru->end_address = rmrr->end_address;
+    dev_scope_start = (void *)(rmrr + 1);
+    dev_scope_end   = ((void *)rmrr) + header->length;
+    ret = acpi_parse_dev_scope(dev_scope_start, dev_scope_end,
+                               rmrru, RMRR_TYPE);
+
+    if ( ret || (rmrru->scope.devices_cnt == 0) )
+        xfree(rmrru);
+    else
+        acpi_register_rmrr_unit(rmrru);
+    return ret;
+}
+
+static int __init
+acpi_parse_one_atsr(struct acpi_dmar_entry_header *header)
+{
+    struct acpi_table_atsr *atsr = (struct acpi_table_atsr *)header;
+    struct acpi_atsr_unit *atsru;
+    int ret = 0;
+    static int all_ports;
+    void *dev_scope_start, *dev_scope_end;
+
+    atsru = xmalloc(struct acpi_atsr_unit);
+    if ( !atsru )
+        return -ENOMEM;
+    memset(atsru, 0, sizeof(struct acpi_atsr_unit));
+
+    atsru->all_ports = atsr->flags & 1; /* BIT0: ALL_PORTS */
+    if ( !atsru->all_ports )
+    {
+        dev_scope_start = (void *)(atsr + 1);
+        dev_scope_end   = ((void *)atsr) + header->length;
+        ret = acpi_parse_dev_scope(dev_scope_start, dev_scope_end,
+                                   atsru, ATSR_TYPE);
+    }
+    else
+    {
+        dprintk(XENLOG_INFO VTDPREFIX, "found ALL_PORTS\n");
+        /* Only allow one ALL_PORTS */
+        if ( all_ports )
+        {
+            dprintk(XENLOG_WARNING VTDPREFIX,
+                    "Only one ALL_PORTS device scope is allowed\n");
+            ret = -EINVAL;
+        }
+        all_ports = 1;
+    }
+
+    if ( ret )
+        xfree(atsru);
+    else
+        acpi_register_atsr_unit(atsru);
+    return ret;
+}
+
+static int __init acpi_parse_dmar(unsigned long phys_addr,
+                                  unsigned long size)
+{
+    struct acpi_table_dmar *dmar;
+    struct acpi_dmar_entry_header *entry_header;
+    int ret = 0;
+
+    if ( !phys_addr || !size )
+        return -EINVAL;
+
+    dmar = (struct acpi_table_dmar *)__acpi_map_table(phys_addr, size);
+    if ( !dmar )
+    {
+        dprintk(XENLOG_WARNING VTDPREFIX, "Unable to map DMAR\n");
+        return -ENODEV;
+    }
+
+    if ( !dmar->haw )
+    {
+        dprintk(XENLOG_WARNING VTDPREFIX, "Zero: Invalid DMAR width\n");
+        if ( force_iommu )
+            panic("acpi_parse_dmar: Invalid DMAR width,"
+                  " crash Xen for security purpose!\n");
+        return -EINVAL;
+    }
+
+    dmar_host_address_width = dmar->haw + 1;
+    dprintk(XENLOG_INFO VTDPREFIX, "Host address width %d\n",
+            dmar_host_address_width);
+
+    entry_header = (struct acpi_dmar_entry_header *)(dmar + 1);
+    while ( ((unsigned long)entry_header) <
+            (((unsigned long)dmar) + size) )
+    {
+        switch ( entry_header->type )
+        {
+        case ACPI_DMAR_DRHD:
+            dprintk(XENLOG_INFO VTDPREFIX, "found ACPI_DMAR_DRHD\n");
+            ret = acpi_parse_one_drhd(entry_header);
+            break;
+        case ACPI_DMAR_RMRR:
+            dprintk(XENLOG_INFO VTDPREFIX, "found ACPI_DMAR_RMRR\n");
+            ret = acpi_parse_one_rmrr(entry_header);
+            break;
+        case ACPI_DMAR_ATSR:
+            dprintk(XENLOG_INFO VTDPREFIX, "found ACPI_DMAR_ATSR\n");
+            ret = acpi_parse_one_atsr(entry_header);
+            break;
+        default:
+            /*
+             * Don't panic on an unknown table; just print a warning
+             * and continue.
+             */
+            dprintk(XENLOG_WARNING VTDPREFIX, "Unknown DMAR structure type\n");
+            break;
+        }
+        if ( ret )
+            break;
+
+        entry_header = ((void *)entry_header + entry_header->length);
+    }
+
+    /* Zap the ACPI DMAR signature to prevent dom0 from using VT-d hardware. */
+    dmar->header.signature[0] = '\0';
+
+    if ( ret )
+    {
+        if ( force_iommu )
+            panic("acpi_parse_dmar: Failed to parse ACPI DMAR,"
+                  " crash Xen for security purpose!\n");
+        else
+        {
+            printk(XENLOG_WARNING
+                   "Failed to parse ACPI DMAR.  Disabling VT-d.\n");
+            disable_all_dmar_units();
+        }
+    }
+
+    return ret;
+}
+
+int acpi_dmar_init(void)
+{
+    int rc;
+
+    rc = -ENODEV;
+    if ( force_iommu )
+        iommu_enabled = 1;
+
+    if ( !iommu_enabled )
+        goto fail;
+
+    acpi_table_parse(ACPI_DMAR, acpi_parse_dmar);
+
+    if ( list_empty(&acpi_drhd_units) )
+        goto fail;
+
+    printk("Intel VT-d has been enabled\n");
+
+    return 0;
+
+ fail:
+    if ( force_iommu )
+        panic("acpi_dmar_init: acpi_dmar_init failed,"
+              " crash Xen for security purpose!\n");
+
+    vtd_enabled = 0;
+    return rc;
+}
diff -Naurp xen/drivers/passthrough/vtd/dmar.h xen-redhat/drivers/passthrough/vtd/dmar.h
--- xen/drivers/passthrough/vtd/dmar.h
+++ xen-redhat/drivers/passthrough/vtd/dmar.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Ashok Raj <ashok.raj@intel.com>
+ * Copyright (C) Shaohua Li <shaohua.li@intel.com>
+ */
+
+#ifndef _DMAR_H_
+#define _DMAR_H_
+
+#include <xen/list.h>
+#include <xen/iommu.h>
+
+extern u8 dmar_host_address_width;
+
+/* This one is for interrupt remapping */
+struct acpi_ioapic_unit {
+    struct list_head list;
+    int apic_id;
+    union {
+        u16 info;
+        struct {
+            u16 func: 3,
+                dev:  5,
+                bus:  8;
+        }bdf;
+    }ioapic;
+};
+
+struct dmar_scope {
+    DECLARE_BITMAP(buses, 256);         /* buses owned by this unit */
+    u16    *devices;                    /* devices owned by this unit */
+    int    devices_cnt;
+};
+
+struct acpi_drhd_unit {
+    struct dmar_scope scope;            /* must be first member of struct */
+    struct list_head list;
+    u64    address;                     /* register base address of the unit */
+    u8     include_all:1;
+    struct iommu *iommu;
+    struct list_head ioapic_list;
+};
+
+struct acpi_rmrr_unit {
+    struct dmar_scope scope;            /* must be first member of struct */
+    struct list_head list;
+    u64    base_address;
+    u64    end_address;
+    u8     allow_all:1;
+};
+
+struct acpi_atsr_unit {
+    struct dmar_scope scope;            /* must be first member of struct */
+    struct list_head list;
+    u8     all_ports:1;
+};
+
+
+#define for_each_drhd_unit(drhd) \
+    list_for_each_entry(drhd, &acpi_drhd_units, list)
+
+#define for_each_rmrr_device(rmrr, bdf, idx)            \
+    list_for_each_entry(rmrr, &acpi_rmrr_units, list)   \
+        /* assume there never is a bdf == 0 */          \
+        for (idx = 0; (bdf = rmrr->scope.devices[idx]) && \
+                 idx < rmrr->scope.devices_cnt; idx++)
+
+struct acpi_drhd_unit * acpi_find_matched_drhd_unit(u8 bus, u8 devfn);
+struct acpi_atsr_unit * acpi_find_matched_atsr_unit(u8 bus, u8 devfn);
+void dmar_scope_add_buses(struct dmar_scope *scope, u16 sec, u16 sub);
+void dmar_scope_remove_buses(struct dmar_scope *scope, u16 sec, u16 sub);
+
+#define DMAR_TYPE 1
+#define RMRR_TYPE 2
+#define ATSR_TYPE 3
+
+#define DMAR_OPERATION_TIMEOUT MILLISECS(1000)
+
+int vtd_hw_check(void);
+void disable_pmr(struct iommu *iommu);
+int is_usb_device(u8 bus, u8 devfn);
+
+#endif /* _DMAR_H_ */
diff -Naurp xen/drivers/passthrough/vtd/extern.h xen-redhat/drivers/passthrough/vtd/extern.h
--- xen/drivers/passthrough/vtd/extern.h
+++ xen-redhat/drivers/passthrough/vtd/extern.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Allen Kay <allen.m.kay@intel.com>
+ * Copyright (C) Weidong Han <weidong.han@intel.com>
+ */
+
+#ifndef _VTD_EXTERN_H_
+#define _VTD_EXTERN_H_
+
+#include "dmar.h"
+
+extern struct qi_ctrl *qi_ctrl;
+extern struct ir_ctrl *ir_ctrl;
+
+void print_iommu_regs(struct acpi_drhd_unit *drhd);
+void print_vtd_entries(struct iommu *iommu, int bus, int devfn, u64 gmfn);
+void dump_iommu_info(unsigned char key);
+
+int qinval_setup(struct iommu *iommu);
+int intremap_setup(struct iommu *iommu);
+int queue_invalidate_context(struct iommu *iommu,
+    u16 did, u16 source_id, u8 function_mask, u8 granu);
+int queue_invalidate_iotlb(struct iommu *iommu,
+    u8 granu, u8 dr, u8 dw, u16 did, u8 am, u8 ih, u64 addr);
+int queue_invalidate_iec(struct iommu *iommu,
+    u8 granu, u8 im, u16 iidx);
+int invalidate_sync(struct iommu *iommu);
+int iommu_flush_iec_global(struct iommu *iommu);
+int iommu_flush_iec_index(struct iommu *iommu, u8 im, u16 iidx);
+struct iommu * ioapic_to_iommu(unsigned int apic_id);
+struct acpi_drhd_unit * ioapic_to_drhd(unsigned int apic_id);
+void clear_fault_bits(struct iommu *iommu);
+
+#endif // _VTD_EXTERN_H_
diff -Naurp xen/drivers/passthrough/vtd/intremap.c xen-redhat/drivers/passthrough/vtd/intremap.c
--- xen/drivers/passthrough/vtd/intremap.c
+++ xen-redhat/drivers/passthrough/vtd/intremap.c
@@ -0,0 +1,703 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Allen Kay <allen.m.kay@intel.com>
+ * Copyright (C) Xiaohui Xin <xiaohui.xin@intel.com>
+ */
+
+#include <xen/irq.h>
+#include <xen/sched.h>
+#include <xen/iommu.h>
+#include <asm/hvm/iommu.h>
+#include <xen/time.h>
+#include <xen/list.h>
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
+#include "iommu.h"
+#include "dmar.h"
+#include "vtd.h"
+#include "extern.h"
+
+/*
+ * The maximum number of IO-APIC (or IO-SAPIC) pins.  Typical values are 24
+ * or 48 on x86 and Itanium platforms; we use the larger value 256, which
+ * matches IREMAP_ENTRY_NR and should be big enough.
+ */
+#define MAX_IOAPIC_PIN_NUM  256
+
+struct ioapicid_pin_intremap_index {
+    struct list_head list;
+    unsigned int ioapic_id;
+    unsigned int pin;
+    int intremap_index;
+};
+
+static struct list_head ioapic_pin_to_intremap_index[MAX_IOAPIC_PIN_NUM];
+
+static int init_ioapic_pin_intremap_index(void)
+{
+    static int initialized = 0;
+    int i;
+
+    if ( initialized == 1 )
+        return 0;
+
+    for ( i = 0; i < MAX_IOAPIC_PIN_NUM; i++ )
+        INIT_LIST_HEAD(&ioapic_pin_to_intremap_index[i]);
+
+    initialized = 1;
+    return 0;
+}
+
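+/*
+ * Return the interrupt-remapping table index previously recorded for the
+ * given (IO-APIC id, pin) pair, or -1 if none exists yet.
+ */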
+static int get_ioapic_pin_intremap_index(unsigned int ioapic_id,
+                                         unsigned int pin)
+{
+    struct ioapicid_pin_intremap_index *entry;
+    struct list_head *pos, *tmp;
+
+    list_for_each_safe ( pos, tmp, &ioapic_pin_to_intremap_index[pin] )
+    {
+        entry = list_entry(pos, struct ioapicid_pin_intremap_index, list);
+        if ( entry->ioapic_id == ioapic_id )
+            return entry->intremap_index;
+    }
+
+    return -1;
+}
+
+static int set_ioapic_pin_intremap_index(unsigned int ioapic_id,
+                                         unsigned int pin,
+                                         int index)
+{
+    struct ioapicid_pin_intremap_index *entry;
+
+    entry = xmalloc(struct ioapicid_pin_intremap_index);
+    if ( !entry )
+        return -ENOMEM;
+
+    entry->ioapic_id = ioapic_id;
+    entry->pin = pin;
+    entry->intremap_index = index;
+
+    list_add_tail(&entry->list, &ioapic_pin_to_intremap_index[pin]);
+
+    return 0;
+}
+
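+/* Translate an IO-APIC id into the source-id (bus/dev/func) listed in the DMAR scope. */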
+u16 apicid_to_bdf(int apic_id)
+{
+    struct acpi_drhd_unit *drhd = ioapic_to_drhd(apic_id);
+    struct acpi_ioapic_unit *acpi_ioapic_unit;
+
+    list_for_each_entry ( acpi_ioapic_unit, &drhd->ioapic_list, list )
+        if ( acpi_ioapic_unit->apic_id == apic_id )
+            return acpi_ioapic_unit->ioapic.info;
+
+    dprintk(XENLOG_ERR VTDPREFIX, "Didn't find the bdf for the apic_id!\n");
+    return 0;
+}
+
+/* Mark specified intr remap entry as free */
+static void free_remap_entry(struct iommu *iommu, int index)
+{
+    struct iremap_entry *iremap_entry = NULL, *iremap_entries;
+    struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
+
+    if ( index < 0 || index > IREMAP_ENTRY_NR - 1 )
+        return;
+
+    ASSERT( spin_is_locked(&ir_ctrl->iremap_lock) );
+
+    GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index,
+                     iremap_entries, iremap_entry);
+
+    memset(iremap_entry, 0, sizeof(struct iremap_entry));
+    iommu_flush_cache_entry(iremap_entry);
+    iommu_flush_iec_index(iommu, 0, index);
+
+    unmap_vtd_domain_page(iremap_entries);
+    ir_ctrl->iremap_num--;
+}
+
+/*
+ * Look for a free interrupt remapping entry.
+ * The caller must hold iremap_lock and set up the returned entry before
+ * releasing the lock.
+ */
+static int alloc_remap_entry(struct iommu *iommu)
+{
+    struct iremap_entry *iremap_entries = NULL;
+    struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
+    int i;
+
+    ASSERT( spin_is_locked(&ir_ctrl->iremap_lock) );
+
+    for ( i = 0; i < IREMAP_ENTRY_NR; i++ )
+    {
+        struct iremap_entry *p;
+        if ( i % (1 << IREMAP_ENTRY_ORDER) == 0 )
+        {
+            /* This entry spans a page boundary */
+            if ( iremap_entries )
+                unmap_vtd_domain_page(iremap_entries);
+
+            GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, i,
+                             iremap_entries, p);
+        }
+        else
+            p = &iremap_entries[i % (1 << IREMAP_ENTRY_ORDER)];
+
+        if ( p->lo_val == 0 && p->hi_val == 0 ) /* a free entry */
+            break;
+    }
+
+    if ( iremap_entries )
+        unmap_vtd_domain_page(iremap_entries);
+
+    if ( i < IREMAP_ENTRY_NR )
+        ir_ctrl->iremap_num++;
+    return i;
+}
+
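+/* Reconstruct an IO-APIC RTE from the interrupt remapping entry at 'index'. */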
+static int remap_entry_to_ioapic_rte(
+    struct iommu *iommu, int index, struct IO_xAPIC_route_entry *old_rte)
+{
+    struct iremap_entry *iremap_entry = NULL, *iremap_entries;
+    unsigned long flags;
+    struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
+
+    if ( ir_ctrl == NULL )
+    {
+        dprintk(XENLOG_ERR VTDPREFIX,
+                "remap_entry_to_ioapic_rte: ir_ctl is not ready\n");
+        return -EFAULT;
+    }
+
+    if ( index < 0 || index > IREMAP_ENTRY_NR - 1 )
+    {
+        dprintk(XENLOG_ERR VTDPREFIX,
+                "%s: index (%d) for remap table is invalid !\n",
+                __func__, index);
+        return -EFAULT;
+    }
+
+    spin_lock_irqsave(&ir_ctrl->iremap_lock, flags);
+
+    GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index,
+                     iremap_entries, iremap_entry);
+
+    if ( iremap_entry->hi_val == 0 && iremap_entry->lo_val == 0 )
+    {
+        dprintk(XENLOG_ERR VTDPREFIX,
+                "%s: index (%d) get an empty entry!\n",
+                __func__, index);
+        unmap_vtd_domain_page(iremap_entries);
+        spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
+        return -EFAULT;
+    }
+
+    old_rte->vector = iremap_entry->lo.vector;
+    old_rte->delivery_mode = iremap_entry->lo.dlm;
+    old_rte->dest_mode = iremap_entry->lo.dm;
+    old_rte->trigger = iremap_entry->lo.tm;
+    old_rte->__reserved_2 = 0;
+    old_rte->dest.logical.__reserved_1 = 0;
+    old_rte->dest.logical.logical_dest = iremap_entry->lo.dst >> 8;
+
+    unmap_vtd_domain_page(iremap_entries);
+    spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
+    return 0;
+}
+
+static int ioapic_rte_to_remap_entry(struct iommu *iommu,
+    int apic_id, unsigned int ioapic_pin, struct IO_xAPIC_route_entry *old_rte,
+    unsigned int rte_upper, unsigned int value)
+{
+    struct iremap_entry *iremap_entry = NULL, *iremap_entries;
+    struct iremap_entry new_ire;
+    struct IO_APIC_route_remap_entry *remap_rte;
+    struct IO_xAPIC_route_entry new_rte;
+    int index;
+    unsigned long flags;
+    struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
+
+    remap_rte = (struct IO_APIC_route_remap_entry *) old_rte;
+    spin_lock_irqsave(&ir_ctrl->iremap_lock, flags);
+
+    index = get_ioapic_pin_intremap_index(apic_id, ioapic_pin);
+    if ( index < 0 )
+    {
+        index = alloc_remap_entry(iommu);
+        if ( index < IREMAP_ENTRY_NR )
+            set_ioapic_pin_intremap_index(apic_id, ioapic_pin, index);
+    }
+
+    if ( index > IREMAP_ENTRY_NR - 1 )
+    {
+        dprintk(XENLOG_ERR VTDPREFIX,
+                "%s: intremap index (%d) is larger than"
+                " the maximum index (%d)!\n",
+                __func__, index, IREMAP_ENTRY_NR - 1);
+        spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
+        return -EFAULT;
+    }
+
+    GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index,
+                     iremap_entries, iremap_entry);
+
+    memcpy(&new_ire, iremap_entry, sizeof(struct iremap_entry));
+
+    if ( rte_upper )
+    {
+#if defined(__i386__) || defined(__x86_64__)
+        new_ire.lo.dst = (value >> 24) << 8;
+#else /* __ia64__ */
+        new_ire.lo.dst = value >> 16;
+#endif
+    }
+    else
+    {
+        *(((u32 *)&new_rte) + 0) = value;
+        new_ire.lo.fpd = 0;
+        new_ire.lo.dm = new_rte.dest_mode;
+        new_ire.lo.rh = 0;
+        new_ire.lo.tm = new_rte.trigger;
+        new_ire.lo.dlm = new_rte.delivery_mode;
+        new_ire.lo.avail = 0;
+        new_ire.lo.res_1 = 0;
+        new_ire.lo.vector = new_rte.vector;
+        new_ire.lo.res_2 = 0;
+        new_ire.hi.sid = apicid_to_bdf(apic_id);
+
+        new_ire.hi.sq = 0;    /* compare all 16 bits of the SID */
+        new_ire.hi.svt = 1;   /* verify requester id using SID and SQ */
+        new_ire.hi.res_1 = 0;
+        new_ire.lo.p = 1;     /* finally, set present bit */
+
+        /* now construct new ioapic rte entry */
+        remap_rte->vector = new_rte.vector;
+        remap_rte->delivery_mode = 0;    /* has to be 0 for remap format */
+        remap_rte->index_15 = (index >> 15) & 0x1;
+        remap_rte->index_0_14 = index & 0x7fff;
+
+        remap_rte->delivery_status = new_rte.delivery_status;
+        remap_rte->polarity = new_rte.polarity;
+        remap_rte->irr = new_rte.irr;
+        remap_rte->trigger = new_rte.trigger;
+        remap_rte->mask = new_rte.mask;
+        remap_rte->reserved = 0;
+        remap_rte->format = 1;    /* indicate remap format */
+    }
+
+    memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
+    iommu_flush_cache_entry(iremap_entry);
+    iommu_flush_iec_index(iommu, 0, index);
+    invalidate_sync(iommu);
+
+    unmap_vtd_domain_page(iremap_entries);
+    spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
+    return 0;
+}
+
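+/*
+ * Read an IO-APIC RTE.  When the pin has an interrupt remapping entry, the
+ * RTE is reconstructed from that entry; otherwise the hardware register is
+ * read directly.
+ */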
+unsigned int io_apic_read_remap_rte(
+    unsigned int apic, unsigned int reg)
+{
+    struct IO_xAPIC_route_entry old_rte = { 0 };
+    struct IO_APIC_route_remap_entry *remap_rte;
+    int rte_upper = (reg & 1) ? 1 : 0;
+    struct iommu *iommu = ioapic_to_iommu(IO_APIC_ID(apic));
+    struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
+    unsigned int ioapic_pin = (reg - 0x10) / 2;
+    int index;
+
+    if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 ||
+        (ir_ctrl->iremap_num == 0) )
+    {
+        *IO_APIC_BASE(apic) = reg;
+        return *(IO_APIC_BASE(apic)+4);
+    }
+
+    index = get_ioapic_pin_intremap_index(IO_APIC_ID(apic), ioapic_pin);
+    if ( index < 0 )
+    {
+        *IO_APIC_BASE(apic) = reg;
+        return *(IO_APIC_BASE(apic)+4);
+    }
+
+    if ( rte_upper )
+        reg--;
+
+    /* read lower and upper 32-bits of rte entry */
+    *IO_APIC_BASE(apic) = reg;
+    *(((u32 *)&old_rte) + 0) = *(IO_APIC_BASE(apic)+4);
+    *IO_APIC_BASE(apic) = reg + 1;
+    *(((u32 *)&old_rte) + 1) = *(IO_APIC_BASE(apic)+4);
+
+    remap_rte = (struct IO_APIC_route_remap_entry *) &old_rte;
+
+    if ( remap_entry_to_ioapic_rte(iommu, index, &old_rte) )
+    {
+        *IO_APIC_BASE(apic) = rte_upper ? (reg + 1) : reg;
+        return *(IO_APIC_BASE(apic)+4);
+    }
+
+    if ( rte_upper )
+        return (*(((u32 *)&old_rte) + 1));
+    else
+        return (*(((u32 *)&old_rte) + 0));
+}
+
+void io_apic_write_remap_rte(
+    unsigned int apic, unsigned int reg, unsigned int value)
+{
+    unsigned int ioapic_pin = (reg - 0x10) / 2;
+    struct IO_xAPIC_route_entry old_rte = { 0 };
+    struct IO_APIC_route_remap_entry *remap_rte;
+    unsigned int rte_upper = (reg & 1) ? 1 : 0;
+    struct iommu *iommu = ioapic_to_iommu(IO_APIC_ID(apic));
+    struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
+    int saved_mask;
+
+    if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 )
+    {
+        *IO_APIC_BASE(apic) = reg;
+        *(IO_APIC_BASE(apic)+4) = value;
+        return;
+    }
+
+    if ( rte_upper )
+        reg--;
+
+    /* read both lower and upper 32-bits of rte entry */
+    *IO_APIC_BASE(apic) = reg;
+    *(((u32 *)&old_rte) + 0) = *(IO_APIC_BASE(apic)+4);
+    *IO_APIC_BASE(apic) = reg + 1;
+    *(((u32 *)&old_rte) + 1) = *(IO_APIC_BASE(apic)+4);
+
+    remap_rte = (struct IO_APIC_route_remap_entry *) &old_rte;
+
+    /* mask the interrupt while we change the intremap table */
+    saved_mask = remap_rte->mask;
+    remap_rte->mask = 1;
+    *IO_APIC_BASE(apic) = reg;
+    *(IO_APIC_BASE(apic)+4) = *(((int *)&old_rte)+0);
+    remap_rte->mask = saved_mask;
+
+    ASSERT(ioapic_pin < MAX_IOAPIC_PIN_NUM);
+    if ( ioapic_rte_to_remap_entry(iommu, IO_APIC_ID(apic), ioapic_pin,
+                                   &old_rte, rte_upper, value) )
+    {
+        *IO_APIC_BASE(apic) = rte_upper ? (reg + 1) : reg;
+        *(IO_APIC_BASE(apic)+4) = value;
+        return;
+    }
+
+    /* write new entry to ioapic */
+    *IO_APIC_BASE(apic) = reg;
+    *(IO_APIC_BASE(apic)+4) = *(((u32 *)&old_rte)+0);
+    *IO_APIC_BASE(apic) = reg + 1;
+    *(IO_APIC_BASE(apic)+4) = *(((u32 *)&old_rte)+1);
+}
+
+#if defined(__i386__) || defined(__x86_64__)
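+/* Rebuild an MSI address/data pair from the interrupt remapping entry it references. */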
+static int remap_entry_to_msi_msg(
+    struct iommu *iommu, struct msi_msg *msg)
+{
+    struct iremap_entry *iremap_entry = NULL, *iremap_entries;
+    struct msi_msg_remap_entry *remap_rte;
+    int index;
+    unsigned long flags;
+    struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
+
+    if ( ir_ctrl == NULL )
+    {
+        dprintk(XENLOG_ERR VTDPREFIX,
+                "remap_entry_to_msi_msg: ir_ctl == NULL");
+        return -EFAULT;
+    }
+
+    remap_rte = (struct msi_msg_remap_entry *) msg;
+    index = (remap_rte->address_lo.index_15 << 15) |
+             remap_rte->address_lo.index_0_14;
+
+    if ( index < 0 || index > IREMAP_ENTRY_NR - 1 )
+    {
+        dprintk(XENLOG_ERR VTDPREFIX,
+                "%s: index (%d) for remap table is invalid !\n",
+                __func__, index);
+        return -EFAULT;
+    }
+
+    spin_lock_irqsave(&ir_ctrl->iremap_lock, flags);
+
+    GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index,
+                     iremap_entries, iremap_entry);
+
+    if ( iremap_entry->hi_val == 0 && iremap_entry->lo_val == 0 )
+    {
+        dprintk(XENLOG_ERR VTDPREFIX,
+                "%s: index (%d) get an empty entry!\n",
+                __func__, index);
+        unmap_vtd_domain_page(iremap_entries);
+        spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
+        return -EFAULT;
+    }
+
+    msg->address_hi = MSI_ADDR_BASE_HI;
+    msg->address_lo =
+        MSI_ADDR_BASE_LO |
+        ((iremap_entry->lo.dm == 0) ?
+            MSI_ADDR_DESTMODE_PHYS:
+            MSI_ADDR_DESTMODE_LOGIC) |
+        ((iremap_entry->lo.dlm != dest_LowestPrio) ?
+            MSI_ADDR_REDIRECTION_CPU:
+            MSI_ADDR_REDIRECTION_LOWPRI) |
+        iremap_entry->lo.dst >> 8;
+
+    msg->data =
+        MSI_DATA_TRIGGER_EDGE |
+        MSI_DATA_LEVEL_ASSERT |
+        ((iremap_entry->lo.dlm != dest_LowestPrio) ?
+            MSI_DATA_DELIVERY_FIXED:
+            MSI_DATA_DELIVERY_LOWPRI) |
+        iremap_entry->lo.vector;
+
+    unmap_vtd_domain_page(iremap_entries);
+    spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
+    return 0;
+}
+
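+/*
+ * Program (or, when msg is NULL, free) the IRTE for an MSI/MSI-X source and
+ * rewrite the message so that it references the remapping table index
+ * instead of carrying destination/vector information directly.
+ */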
+static int msi_msg_to_remap_entry(
+    struct iommu *iommu, struct pci_dev *pdev,
+    struct msi_desc *msi_desc, struct msi_msg *msg)
+{
+    struct iremap_entry *iremap_entry = NULL, *iremap_entries;
+    struct iremap_entry new_ire;
+    struct msi_msg_remap_entry *remap_rte;
+    unsigned int index;
+    unsigned long flags;
+    struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
+
+    remap_rte = (struct msi_msg_remap_entry *) msg;
+    spin_lock_irqsave(&ir_ctrl->iremap_lock, flags);
+
+    if ( msg == NULL )
+    {
+        /* Free specified unused IRTE */
+        free_remap_entry(iommu, msi_desc->remap_index);
+        spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
+        return 0;
+    }
+
+    if ( msi_desc->remap_index < 0 )
+    {
+        /*
+         * TODO: Multiple-vector MSI requires allocating multiple contiguous
+         * entries and configuring the msi_msg addr/data differently, so
+         * alloc_remap_entry will need to change when multiple-vector MSI is
+         * enabled in the future.
+         */
+        index = alloc_remap_entry(iommu);
+        msi_desc->remap_index = index;
+    }
+    else
+        index = msi_desc->remap_index;
+
+    if ( index > IREMAP_ENTRY_NR - 1 )
+    {
+        dprintk(XENLOG_ERR VTDPREFIX,
+                "%s: intremap index (%d) is larger than"
+                " the maximum index (%d)!\n",
+                __func__, index, IREMAP_ENTRY_NR - 1);
+        msi_desc->remap_index = -1;
+        spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
+        return -EFAULT;
+    }
+
+    GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index,
+                     iremap_entries, iremap_entry);
+
+    memcpy(&new_ire, iremap_entry, sizeof(struct iremap_entry));
+
+    /* Set interrupt remapping table entry */
+    new_ire.lo.fpd = 0;
+    new_ire.lo.dm = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) & 0x1;
+    new_ire.lo.rh = 0;
+    new_ire.lo.tm = (msg->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
+    new_ire.lo.dlm = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1;
+    new_ire.lo.avail = 0;
+    new_ire.lo.res_1 = 0;
+    new_ire.lo.vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) &
+                        MSI_DATA_VECTOR_MASK;
+    new_ire.lo.res_2 = 0;
+    new_ire.lo.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT)
+                      & 0xff) << 8;
+
+    new_ire.hi.sid = (pdev->bus << 8) | pdev->devfn;
+    new_ire.hi.sq = 0;
+    new_ire.hi.svt = 1;
+    new_ire.hi.res_1 = 0;
+    new_ire.lo.p = 1;    /* finally, set present bit */
+
+    /* now construct new MSI/MSI-X rte entry */
+    remap_rte->address_lo.dontcare = 0;
+    remap_rte->address_lo.index_15 = (index >> 15) & 0x1;
+    remap_rte->address_lo.index_0_14 = index & 0x7fff;
+    remap_rte->address_lo.SHV = 1;
+    remap_rte->address_lo.format = 1;
+
+    remap_rte->address_hi = 0;
+    remap_rte->data = 0;
+
+    memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
+    iommu_flush_cache_entry(iremap_entry);
+    iommu_flush_iec_index(iommu, 0, index);
+    invalidate_sync(iommu);
+
+    unmap_vtd_domain_page(iremap_entries);
+    spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
+    return 0;
+}
+
+void msi_msg_read_remap_rte(
+    struct msi_desc *msi_desc, struct msi_msg *msg)
+{
+    struct pci_dev *pdev = msi_desc->dev;
+    struct acpi_drhd_unit *drhd = NULL;
+    struct iommu *iommu = NULL;
+    struct ir_ctrl *ir_ctrl;
+
+    drhd = acpi_find_matched_drhd_unit(pdev->bus, pdev->devfn);
+    if ( !drhd )
+        return;
+    iommu = drhd->iommu;
+
+    ir_ctrl = iommu_ir_ctrl(iommu);
+    if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 )
+        return;
+
+    remap_entry_to_msi_msg(iommu, msg);
+}
+
+void msi_msg_write_remap_rte(
+    struct msi_desc *msi_desc, struct msi_msg *msg)
+{
+    struct pci_dev *pdev = msi_desc->dev;
+    struct acpi_drhd_unit *drhd = NULL;
+    struct iommu *iommu = NULL;
+    struct ir_ctrl *ir_ctrl;
+
+    drhd = acpi_find_matched_drhd_unit(pdev->bus, pdev->devfn);
+    if ( !drhd )
+        return;
+    iommu = drhd->iommu;
+
+    ir_ctrl = iommu_ir_ctrl(iommu);
+    if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 )
+        return;
+
+    msi_msg_to_remap_entry(iommu, pdev, msi_desc, msg);
+}
+#elif defined(__ia64__)
+void msi_msg_read_remap_rte(
+    struct msi_desc *msi_desc, struct msi_msg *msg)
+{
+    /* TODO. */
+}
+
+void msi_msg_write_remap_rte(
+    struct msi_desc *msi_desc, struct msi_msg *msg)
+{
+    /* TODO. */
+}
+#endif
+
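+/*
+ * Allocate the interrupt remapping table if necessary, point the IOMMU at it
+ * via DMAR_IRTA_REG, and enable compatibility-format pass-through and
+ * interrupt remapping, waiting for each status bit to become set.
+ */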
+int intremap_setup(struct iommu *iommu)
+{
+    struct ir_ctrl *ir_ctrl;
+    s_time_t start_time;
+
+    if ( !ecap_intr_remap(iommu->ecap) )
+        return -ENODEV;
+
+    ir_ctrl = iommu_ir_ctrl(iommu);
+    if ( ir_ctrl->iremap_maddr == 0 )
+    {
+        ir_ctrl->iremap_maddr = alloc_pgtable_maddr(NULL);
+        if ( ir_ctrl->iremap_maddr == 0 )
+        {
+            dprintk(XENLOG_WARNING VTDPREFIX,
+                    "Cannot allocate memory for ir_ctrl->iremap_maddr\n");
+            return -ENOMEM;
+        }
+        ir_ctrl->iremap_num = 0;
+    }
+
+#if defined(ENABLED_EXTENDED_INTERRUPT_SUPPORT)
+    /* set extended interrupt mode bit */
+    ir_ctrl->iremap_maddr |=
+            ecap_ext_intr(iommu->ecap) ? (1 << IRTA_REG_EIME_SHIFT) : 0;
+#endif
+    /* set size of the interrupt remapping table */
+    ir_ctrl->iremap_maddr |= IRTA_REG_TABLE_SIZE;
+    dmar_writeq(iommu->reg, DMAR_IRTA_REG, ir_ctrl->iremap_maddr);
+
+    /* set SIRTP */
+    iommu->gcmd |= DMA_GCMD_SIRTP;
+    dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
+
+    /* Make sure hardware complete it */
+    start_time = NOW();
+    while ( !(dmar_readl(iommu->reg, DMAR_GSTS_REG) & DMA_GSTS_SIRTPS) )
+    {
+        if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) )
+            panic("Cannot set SIRTP field for interrupt remapping\n");
+        cpu_relax();
+    }
+
+    /* enable compatibility format interrupt pass-through */
+    iommu->gcmd |= DMA_GCMD_CFI;
+    dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
+
+    start_time = NOW();
+    while ( !(dmar_readl(iommu->reg, DMAR_GSTS_REG) & DMA_GSTS_CFIS) )
+    {
+        if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) )
+            panic("Cannot set CFI field for interrupt remapping\n");
+        cpu_relax();
+    }
+
+    /* enable interrupt remapping hardware */
+    iommu->gcmd |= DMA_GCMD_IRE;
+    dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
+
+    start_time = NOW();
+    while ( !(dmar_readl(iommu->reg, DMAR_GSTS_REG) & DMA_GSTS_IRES) )
+    {
+        if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) )
+        {
+            dprintk(XENLOG_ERR VTDPREFIX,
+                    "Cannot set IRE field for interrupt remapping\n");
+            return -ENODEV;
+        }
+        cpu_relax();
+    }
+
+    /* After setting SIRTP, globally invalidate the IEC */
+    iommu_flush_iec_global(iommu);
+
+    init_ioapic_pin_intremap_index();
+
+    return 0;
+}
diff -Naurp xen/drivers/passthrough/vtd/iommu.c xen-redhat/drivers/passthrough/vtd/iommu.c
--- xen/drivers/passthrough/vtd/iommu.c
+++ xen-redhat/drivers/passthrough/vtd/iommu.c
@@ -0,0 +1,1966 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Ashok Raj <ashok.raj@intel.com>
+ * Copyright (C) Shaohua Li <shaohua.li@intel.com>
+ * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen
+ */
+
+#include <xen/irq.h>
+#include <xen/sched.h>
+#include <xen/xmalloc.h>
+#include <xen/domain_page.h>
+#include <xen/iommu.h>
+#include <asm/hvm/iommu.h>
+#include <xen/numa.h>
+#include <xen/time.h>
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
+#include <xen/keyhandler.h>
+#include "iommu.h"
+#include "dmar.h"
+#include "extern.h"
+#include "vtd.h"
+
+#define domain_iommu_domid(d) ((d)->arch.hvm_domain.hvm_iommu.iommu_domid)
+
+static spinlock_t domid_bitmap_lock;    /* protect domain id bitmap */
+static int domid_bitmap_size;           /* domain id bitmap size in bits */
+static unsigned long *domid_bitmap;     /* iommu domain id bitmap */
+static int rwbf_quirk = 0;
+
+static void setup_dom0_devices(struct domain *d);
+static void setup_dom0_rmrr(struct domain *d);
+
+#define DID_FIELD_WIDTH 16
+#define DID_HIGH_OFFSET 8
+static void context_set_domain_id(struct context_entry *context,
+                                  struct domain *d)
+{
+    domid_t iommu_domid = domain_iommu_domid(d);
+
+    if ( iommu_domid == 0 )
+    {
+        spin_lock(&domid_bitmap_lock);
+        iommu_domid = find_first_zero_bit(domid_bitmap, domid_bitmap_size);
+        set_bit(iommu_domid, domid_bitmap);
+        spin_unlock(&domid_bitmap_lock);
+        d->arch.hvm_domain.hvm_iommu.iommu_domid = iommu_domid;
+    }
+
+    context->hi &= (1 << DID_HIGH_OFFSET) - 1;
+    context->hi |= iommu_domid << DID_HIGH_OFFSET;
+}
+
+static void iommu_domid_release(struct domain *d)
+{
+    domid_t iommu_domid = domain_iommu_domid(d);
+
+    if ( iommu_domid != 0 )
+    {
+        d->arch.hvm_domain.hvm_iommu.iommu_domid = 0;
+        clear_bit(iommu_domid, domid_bitmap);
+    }
+}
+
+static struct intel_iommu *alloc_intel_iommu(void)
+{
+    struct intel_iommu *intel;
+
+    intel = xmalloc(struct intel_iommu);
+    if ( intel == NULL )
+        return NULL;
+    memset(intel, 0, sizeof(struct intel_iommu));
+
+    spin_lock_init(&intel->qi_ctrl.qinval_lock);
+    spin_lock_init(&intel->qi_ctrl.qinval_poll_lock);
+    spin_lock_init(&intel->ir_ctrl.iremap_lock);
+
+    return intel;
+}
+
+static void free_intel_iommu(struct intel_iommu *intel)
+{
+    xfree(intel);
+}
+
+struct qi_ctrl *iommu_qi_ctrl(struct iommu *iommu)
+{
+    return iommu ? &iommu->intel->qi_ctrl : NULL;
+}
+
+struct ir_ctrl *iommu_ir_ctrl(struct iommu *iommu)
+{
+    return iommu ? &iommu->intel->ir_ctrl : NULL;
+}
+
+struct iommu_flush *iommu_get_flush(struct iommu *iommu)
+{
+    return iommu ? &iommu->intel->flush : NULL;
+}
+
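+/*
+ * Flush CPU cache lines covering IOMMU data structures.  This is only
+ * needed when at least one IOMMU is not coherent with the CPU caches.
+ */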
+static unsigned int clflush_size;
+static int iommus_incoherent;
+static void __iommu_flush_cache(void *addr, int size)
+{
+    int i;
+
+    if ( !iommus_incoherent )
+        return;
+
+    for ( i = 0; i < size; i += clflush_size )
+        cacheline_flush((char *)addr + i);
+}
+
+void iommu_flush_cache_entry(void *addr)
+{
+    __iommu_flush_cache(addr, 8);
+}
+
+void iommu_flush_cache_page(void *addr)
+{
+    __iommu_flush_cache(addr, PAGE_SIZE_4K);
+}
+
+int nr_iommus;
+/* context entry handling */
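+/*
+ * Return the machine address of the context-entry table for the given bus,
+ * allocating and installing it in the root entry on first use.
+ */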
+static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus)
+{
+    struct root_entry *root, *root_entries;
+    u64 maddr;
+
+    ASSERT(spin_is_locked(&iommu->lock));
+    root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
+    root = &root_entries[bus];
+    if ( !root_present(*root) )
+    {
+        maddr = alloc_pgtable_maddr(NULL);
+        if ( maddr == 0 )
+        {
+            unmap_vtd_domain_page(root_entries);
+            return 0;
+        }
+        set_root_value(*root, maddr);
+        set_root_present(*root);
+        iommu_flush_cache_entry(root);
+    }
+    maddr = (u64) get_context_addr(*root);
+    unmap_vtd_domain_page(root_entries);
+    return maddr;
+}
+
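+/*
+ * Walk (and, if 'alloc' is set, build) the DMA page tables for 'addr' and
+ * return the machine address of the leaf (level-1) page table, or 0 on
+ * failure.
+ */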
+static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(domain);
+    int addr_width = agaw_to_width(hd->agaw);
+    struct dma_pte *parent, *pte = NULL;
+    int level = agaw_to_level(hd->agaw);
+    int offset;
+    u64 pte_maddr = 0, maddr;
+    u64 *vaddr = NULL;
+
+    addr &= (((u64)1) << addr_width) - 1;
+    ASSERT(spin_is_locked(&hd->mapping_lock));
+    if ( hd->pgd_maddr == 0 )
+        if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr(domain)) == 0) )
+            goto out;
+
+    parent = (struct dma_pte *)map_vtd_domain_page(hd->pgd_maddr);
+    while ( level > 1 )
+    {
+        offset = address_level_offset(addr, level);
+        pte = &parent[offset];
+
+        if ( dma_pte_addr(*pte) == 0 )
+        {
+            if ( !alloc )
+                break;
+            maddr = alloc_pgtable_maddr(domain);
+            if ( !maddr )
+                break;
+            dma_set_pte_addr(*pte, maddr);
+            vaddr = map_vtd_domain_page(maddr);
+
+            /*
+             * Higher-level tables always set r/w; the last-level
+             * page table controls read/write access.
+             */
+            dma_set_pte_readable(*pte);
+            dma_set_pte_writable(*pte);
+            iommu_flush_cache_entry(pte);
+        }
+        else
+        {
+            vaddr = map_vtd_domain_page(pte->val);
+        }
+
+        if ( level == 2 )
+        {
+            pte_maddr = pte->val & PAGE_MASK_4K;
+            unmap_vtd_domain_page(vaddr);
+            break;
+        }
+
+        unmap_vtd_domain_page(parent);
+        parent = (struct dma_pte *)vaddr;
+        vaddr = NULL;
+        level--;
+    }
+
+    unmap_vtd_domain_page(parent);
+ out:
+    return pte_maddr;
+}
+
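+/*
+ * Flush the IOMMU write buffer and wait for completion.  Only needed when
+ * the capability register or the rwbf quirk requires it.
+ */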
+static void iommu_flush_write_buffer(struct iommu *iommu)
+{
+    u32 val;
+    unsigned long flag;
+    s_time_t start_time;
+
+    if ( !rwbf_quirk && !cap_rwbf(iommu->cap) )
+        return;
+    val = iommu->gcmd | DMA_GCMD_WBF;
+
+    spin_lock_irqsave(&iommu->register_lock, flag);
+    dmar_writel(iommu->reg, DMAR_GCMD_REG, val);
+
+    /* Make sure hardware complete it */
+    start_time = NOW();
+    for ( ; ; )
+    {
+        val = dmar_readl(iommu->reg, DMAR_GSTS_REG);
+        if ( !(val & DMA_GSTS_WBFS) )
+            break;
+        if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
+            panic("%s: DMAR hardware is malfunctional,"
+                  " please disable IOMMU\n", __func__);
+        cpu_relax();
+    }
+    spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+/* Return value determines whether we need a write buffer flush. */
+static int flush_context_reg(
+    void *_iommu,
+    u16 did, u16 source_id, u8 function_mask, u64 type,
+    int non_present_entry_flush)
+{
+    struct iommu *iommu = (struct iommu *) _iommu;
+    u64 val = 0;
+    unsigned long flag;
+    s_time_t start_time;
+
+    /*
+     * In the non-present entry flush case: if the hardware doesn't cache
+     * non-present entries we do nothing; if it does, we flush the entries
+     * of domain 0 (whose domain id is used to cache any non-present
+     * entries).
+     */
+    if ( non_present_entry_flush )
+    {
+        if ( !cap_caching_mode(iommu->cap) )
+            return 1;
+        else
+            did = 0;
+    }
+
+    /* use register invalidation */
+    switch ( type )
+    {
+    case DMA_CCMD_GLOBAL_INVL:
+        val = DMA_CCMD_GLOBAL_INVL;
+        break;
+    case DMA_CCMD_DOMAIN_INVL:
+        val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
+        break;
+    case DMA_CCMD_DEVICE_INVL:
+        val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
+            |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask);
+        break;
+    default:
+        BUG();
+    }
+    val |= DMA_CCMD_ICC;
+
+    spin_lock_irqsave(&iommu->register_lock, flag);
+    dmar_writeq(iommu->reg, DMAR_CCMD_REG, val);
+
+    /* Make sure hardware complete it */
+    start_time = NOW();
+    for ( ; ; )
+    {
+        val = dmar_readq(iommu->reg, DMAR_CCMD_REG);
+        if ( !(val & DMA_CCMD_ICC) )
+            break;
+        if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
+            panic("%s: DMAR hardware is malfunctional,"
+                  " please disable IOMMU\n", __func__);
+        cpu_relax();
+    }
+    spin_unlock_irqrestore(&iommu->register_lock, flag);
+    /* flush context entry will implicitly flush write buffer */
+    return 0;
+}
+
+static int inline iommu_flush_context_global(
+    struct iommu *iommu, int non_present_entry_flush)
+{
+    struct iommu_flush *flush = iommu_get_flush(iommu);
+    return flush->context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
+                                 non_present_entry_flush);
+}
+
+static int inline iommu_flush_context_domain(
+    struct iommu *iommu, u16 did, int non_present_entry_flush)
+{
+    struct iommu_flush *flush = iommu_get_flush(iommu);
+    return flush->context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
+                                 non_present_entry_flush);
+}
+
+static int inline iommu_flush_context_device(
+    struct iommu *iommu, u16 did, u16 source_id,
+    u8 function_mask, int non_present_entry_flush)
+{
+    struct iommu_flush *flush = iommu_get_flush(iommu);
+    return flush->context(iommu, did, source_id, function_mask,
+                                 DMA_CCMD_DEVICE_INVL,
+                                 non_present_entry_flush);
+}
+
+/* Return value determines whether we need a write buffer flush. */
+static int flush_iotlb_reg(void *_iommu, u16 did,
+                               u64 addr, unsigned int size_order, u64 type,
+                               int non_present_entry_flush)
+{
+    struct iommu *iommu = (struct iommu *) _iommu;
+    int tlb_offset = ecap_iotlb_offset(iommu->ecap);
+    u64 val = 0, val_iva = 0;
+    unsigned long flag;
+    s_time_t start_time;
+
+    /*
+     * In the non-present entry flush case: if the hardware doesn't cache
+     * non-present entries we do nothing; if it does, we flush the entries
+     * of domain 0 (whose domain id is used to cache any non-present
+     * entries).
+     */
+    if ( non_present_entry_flush )
+    {
+        if ( !cap_caching_mode(iommu->cap) )
+            return 1;
+        else
+            did = 0;
+    }
+
+    /* use register invalidation */
+    switch ( type )
+    {
+    case DMA_TLB_GLOBAL_FLUSH:
+        /* a global flush doesn't need to set IVA_REG */
+        val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
+        break;
+    case DMA_TLB_DSI_FLUSH:
+        val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+        break;
+    case DMA_TLB_PSI_FLUSH:
+        val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+        /* Note: always flush non-leaf currently */
+        val_iva = size_order | addr;
+        break;
+    default:
+        BUG();
+    }
+    /* Note: set drain read/write */
+    if ( cap_read_drain(iommu->cap) )
+        val |= DMA_TLB_READ_DRAIN;
+    if ( cap_write_drain(iommu->cap) )
+        val |= DMA_TLB_WRITE_DRAIN;
+
+    spin_lock_irqsave(&iommu->register_lock, flag);
+    /* Note: Only uses first TLB reg currently */
+    if ( val_iva )
+        dmar_writeq(iommu->reg, tlb_offset, val_iva);
+    dmar_writeq(iommu->reg, tlb_offset + 8, val);
+
+    /* Make sure hardware complete it */
+    start_time = NOW();
+    for ( ; ; )
+    {
+        val = dmar_readq(iommu->reg, tlb_offset + 8);
+        if ( !(val & DMA_TLB_IVT) )
+            break;
+        if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
+            panic("%s: DMAR hardware is malfunctional,"
+                  " please disable IOMMU\n", __func__);
+        cpu_relax();
+    }
+    spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+    /* check IOTLB invalidation granularity */
+    if ( DMA_TLB_IAIG(val) == 0 )
+        dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n");
+
+    /* flush iotlb entry will implicitly flush write buffer */
+    return 0;
+}
+
+static int inline iommu_flush_iotlb_global(struct iommu *iommu,
+                                           int non_present_entry_flush)
+{
+    struct iommu_flush *flush = iommu_get_flush(iommu);
+    return flush->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
+                               non_present_entry_flush);
+}
+
+static int inline iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did,
+                                        int non_present_entry_flush)
+{
+    struct iommu_flush *flush = iommu_get_flush(iommu);
+    return flush->iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
+                               non_present_entry_flush);
+}
+
+static int inline get_alignment(u64 base, unsigned int size)
+{
+    int t = 0;
+    u64 end;
+
+    end = base + size - 1;
+    while ( base != end )
+    {
+        t++;
+        base >>= 1;
+        end >>= 1;
+    }
+    return t;
+}
+
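+/*
+ * Page-selective IOTLB invalidation.  Falls back to a domain-selective
+ * flush when PSI is unsupported or the region exceeds the maximum
+ * address mask.
+ */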
+static int inline iommu_flush_iotlb_psi(
+    struct iommu *iommu, u16 did,
+    u64 addr, unsigned int pages, int non_present_entry_flush)
+{
+    unsigned int align;
+    struct iommu_flush *flush = iommu_get_flush(iommu);
+
+    ASSERT(!(addr & (~PAGE_MASK_4K)));
+    ASSERT(pages > 0);
+
+    /* Fall back to a domain-selective flush if there is no PSI support */
+    if ( !cap_pgsel_inv(iommu->cap) )
+        return iommu_flush_iotlb_dsi(iommu, did,
+                                     non_present_entry_flush);
+
+    /*
+     * PSI requires the number of pages to be a power of two and the base
+     * address to be naturally aligned to that size.
+     */
+    align = get_alignment(addr >> PAGE_SHIFT_4K, pages);
+    /* Fall back to a domain-selective flush if the size is too big */
+    if ( align > cap_max_amask_val(iommu->cap) )
+        return iommu_flush_iotlb_dsi(iommu, did,
+                                     non_present_entry_flush);
+
+    addr >>= PAGE_SHIFT_4K + align;
+    addr <<= PAGE_SHIFT_4K + align;
+
+    return flush->iotlb(iommu, did, addr, align,
+                               DMA_TLB_PSI_FLUSH, non_present_entry_flush);
+}
+
+void iommu_flush_all(void)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+
+    flush_all_cache();
+    for_each_drhd_unit ( drhd )
+    {
+        iommu = drhd->iommu;
+        iommu_flush_context_global(iommu, 0);
+        iommu_flush_iotlb_global(iommu, 0);
+    }
+}
+
+/* Clear the leaf page table entry that maps one page. */
+static void dma_pte_clear_one(struct domain *domain, u64 addr)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(domain);
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    struct dma_pte *page = NULL, *pte = NULL;
+    u64 pg_maddr;
+
+    spin_lock(&hd->mapping_lock);
+    /* get last level pte */
+    pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
+    if ( pg_maddr == 0 )
+    {
+        spin_unlock(&hd->mapping_lock);
+        return;
+    }
+
+    page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
+    pte = page + address_level_offset(addr, 1);
+
+    if ( !dma_pte_present(*pte) )
+    {
+        spin_unlock(&hd->mapping_lock);
+        unmap_vtd_domain_page(page);
+        return;
+    }
+
+    dma_clear_pte(*pte);
+    spin_unlock(&hd->mapping_lock);
+    iommu_flush_cache_entry(pte);
+
+    /* No need for pcidevs_lock here; flushing is done on assign/deassign. */
+    for_each_drhd_unit ( drhd )
+    {
+        iommu = drhd->iommu;
+        if ( test_bit(iommu->index, &hd->iommu_bitmap) )
+            if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(domain),
+                                       addr, 1, 0))
+                iommu_flush_write_buffer(iommu);
+    }
+
+    unmap_vtd_domain_page(page);
+}
+
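+/* Recursively clear and free a DMA page table rooted at pt_maddr. */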
+static void iommu_free_pagetable(u64 pt_maddr, int level)
+{
+    int i;
+    struct dma_pte *pt_vaddr, *pte;
+    int next_level = level - 1;
+
+    if ( pt_maddr == 0 )
+        return;
+
+    pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
+
+    for ( i = 0; i < PTE_NUM; i++ )
+    {
+        pte = &pt_vaddr[i];
+        if ( !dma_pte_present(*pte) )
+            continue;
+
+        if ( next_level >= 1 )
+            iommu_free_pagetable(dma_pte_addr(*pte), next_level);
+
+        dma_clear_pte(*pte);
+        iommu_flush_cache_entry(pte);
+    }
+
+    unmap_vtd_domain_page(pt_vaddr);
+    free_pgtable_maddr(pt_maddr);
+}
+
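+/*
+ * Install the root-entry table: allocate it if necessary, program
+ * DMAR_RTADDR_REG, and set SRTP in the global command register, waiting
+ * for the hardware to latch the new root pointer.
+ */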
+static int iommu_set_root_entry(struct iommu *iommu)
+{
+    u32 cmd, sts;
+    unsigned long flags;
+    s_time_t start_time;
+
+    spin_lock(&iommu->lock);
+
+    if ( iommu->root_maddr == 0 )
+        iommu->root_maddr = alloc_pgtable_maddr(NULL);
+    if ( iommu->root_maddr == 0 )
+    {
+        spin_unlock(&iommu->lock);
+        return -ENOMEM;
+    }
+
+    spin_unlock(&iommu->lock);
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr);
+    cmd = iommu->gcmd | DMA_GCMD_SRTP;
+    dmar_writel(iommu->reg, DMAR_GCMD_REG, cmd);
+
+    /* Make sure hardware complete it */
+    start_time = NOW();
+    for ( ; ; )
+    {
+        sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
+        if ( sts & DMA_GSTS_RTPS )
+            break;
+        if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
+            panic("%s: DMAR hardware is malfunctional,"
+                  " please disable IOMMU\n", __func__);
+        cpu_relax();
+    }
+
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+    return 0;
+}
+
+static void iommu_enable_translation(struct iommu *iommu)
+{
+    u32 sts;
+    unsigned long flags;
+    s_time_t start_time;
+
+    dprintk(XENLOG_INFO VTDPREFIX,
+            "iommu_enable_translation: iommu->reg = %p\n", iommu->reg);
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    iommu->gcmd |= DMA_GCMD_TE;
+    dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
+    /* Make sure hardware complete it */
+    start_time = NOW();
+    for ( ; ; )
+    {
+        sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
+        if ( sts & DMA_GSTS_TES )
+            break;
+        if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
+            panic("%s: DMAR hardware is malfunctional,"
+                  " please disable IOMMU\n", __func__);
+        cpu_relax();
+    }
+
+    /* Per the spec, disable PMRs once the VT-d engine takes effect. */
+    disable_pmr(iommu);
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+int iommu_disable_translation(struct iommu *iommu)
+{
+    u32 sts;
+    unsigned long flags;
+    s_time_t start_time;
+
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    iommu->gcmd &= ~ DMA_GCMD_TE;
+    dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
+
+    /* Make sure hardware complete it */
+    start_time = NOW();
+    for ( ; ; )
+    {
+        sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
+        if ( !(sts & DMA_GSTS_TES) )
+            break;
+        if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
+            panic("%s: DMAR hardware is malfunctional,"
+                  " please disable IOMMU\n", __func__);
+        cpu_relax();
+    }
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+    return 0;
+}
+
+static struct iommu *vector_to_iommu[NR_VECTORS];
+static int iommu_page_fault_do_one(struct iommu *iommu, int type,
+                                   u8 fault_reason, u16 source_id, u64 addr)
+{
+    dprintk(XENLOG_WARNING VTDPREFIX,
+            "iommu_fault:%s: %x:%x.%x addr %"PRIx64" REASON %x "
+            "iommu->reg = %p\n",
+            (type ? "DMA Read" : "DMA Write"), (source_id >> 8),
+            PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr,
+            fault_reason, iommu->reg);
+
+#ifndef __i386__ /* map_domain_page() cannot be used in this context */
+    if ( fault_reason < 0x20 )
+        print_vtd_entries(iommu, (source_id >> 8),
+                          (source_id & 0xff), (addr >> PAGE_SHIFT));
+#endif
+
+    return 0;
+}
+
+static void iommu_fault_status(u32 fault_status)
+{
+    if ( fault_status & DMA_FSTS_PFO )
+        dprintk(XENLOG_ERR VTDPREFIX,
+            "iommu_fault_status: Fault Overflow\n");
+    if ( fault_status & DMA_FSTS_PPF )
+        dprintk(XENLOG_ERR VTDPREFIX,
+            "iommu_fault_status: Primary Pending Fault\n");
+    if ( fault_status & DMA_FSTS_AFO )
+        dprintk(XENLOG_ERR VTDPREFIX,
+            "iommu_fault_status: Advanced Fault Overflow\n");
+    if ( fault_status & DMA_FSTS_APF )
+        dprintk(XENLOG_ERR VTDPREFIX,
+            "iommu_fault_status: Advanced Pending Fault\n");
+    if ( fault_status & DMA_FSTS_IQE )
+        dprintk(XENLOG_ERR VTDPREFIX,
+            "iommu_fault_status: Invalidation Queue Error\n");
+    if ( fault_status & DMA_FSTS_ICE )
+        dprintk(XENLOG_ERR VTDPREFIX,
+            "iommu_fault_status: Invalidation Completion Error\n");
+    if ( fault_status & DMA_FSTS_ITE )
+        dprintk(XENLOG_ERR VTDPREFIX,
+            "iommu_fault_status: Invalidation Time-out Error\n");
+}
+
+#define PRIMARY_FAULT_REG_LEN (16)
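+/*
+ * Fault interrupt handler: log each pending primary fault record, clear it,
+ * and disable bus mastering on the offending device.
+ */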
+static void iommu_page_fault(int vector, void *dev_id,
+                             struct cpu_user_regs *regs)
+{
+    struct iommu *iommu = dev_id;
+    int reg, fault_index;
+    u32 fault_status;
+    unsigned long flags;
+
+    dprintk(XENLOG_WARNING VTDPREFIX,
+            "iommu_page_fault: iommu->reg = %p\n", iommu->reg);
+
+    fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
+
+    iommu_fault_status(fault_status);
+
+    /* FIXME: ignore advanced fault log */
+    if ( !(fault_status & DMA_FSTS_PPF) )
+        goto clear_overflow;
+
+    fault_index = dma_fsts_fault_record_index(fault_status);
+    reg = cap_fault_reg_offset(iommu->cap);
+    while (1)
+    {
+        u8 fault_reason;
+        u16 source_id, cword;
+        u32 data;
+        u64 guest_addr;
+        int type;
+
+        /* highest 32 bits */
+        spin_lock_irqsave(&iommu->register_lock, flags);
+        data = dmar_readl(iommu->reg, reg +
+                          fault_index * PRIMARY_FAULT_REG_LEN + 12);
+        if ( !(data & DMA_FRCD_F) )
+        {
+            spin_unlock_irqrestore(&iommu->register_lock, flags);
+            break;
+        }
+
+        fault_reason = dma_frcd_fault_reason(data);
+        type = dma_frcd_type(data);
+
+        data = dmar_readl(iommu->reg, reg +
+                          fault_index * PRIMARY_FAULT_REG_LEN + 8);
+        source_id = dma_frcd_source_id(data);
+
+        guest_addr = dmar_readq(iommu->reg, reg +
+                                fault_index * PRIMARY_FAULT_REG_LEN);
+        guest_addr = dma_frcd_page_addr(guest_addr);
+        /* clear the fault */
+        dmar_writel(iommu->reg, reg +
+                    fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F);
+        spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+        iommu_page_fault_do_one(iommu, type, fault_reason,
+                                source_id, guest_addr);
+
+        /* Tell the device to stop DMAing; we can't rely on the guest to
+         * control it for us. */
+        cword = pci_conf_read16(PCI_BUS(source_id), PCI_SLOT(source_id),
+                                PCI_FUNC(source_id), PCI_COMMAND);
+        pci_conf_write16(PCI_BUS(source_id), PCI_SLOT(source_id),
+                         PCI_FUNC(source_id), PCI_COMMAND,
+                         cword & ~PCI_COMMAND_MASTER);
+
+        fault_index++;
+        if ( fault_index > cap_num_fault_regs(iommu->cap) )
+            fault_index = 0;
+    }
+clear_overflow:
+    /* clear primary fault overflow */
+    fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
+    if ( fault_status & DMA_FSTS_PFO )
+    {
+        spin_lock_irqsave(&iommu->register_lock, flags);
+        dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
+        spin_unlock_irqrestore(&iommu->register_lock, flags);
+    }
+}
+
+static void dma_msi_unmask(unsigned int vector)
+{
+    struct iommu *iommu = vector_to_iommu[vector];
+    unsigned long flags;
+
+    /* unmask it */
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static void dma_msi_mask(unsigned int vector)
+{
+    unsigned long flags;
+    struct iommu *iommu = vector_to_iommu[vector];
+
+    /* mask it */
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static unsigned int dma_msi_startup(unsigned int vector)
+{
+    dma_msi_unmask(vector);
+    return 0;
+}
+
+static void dma_msi_end(unsigned int vector)
+{
+    dma_msi_unmask(vector);
+    ack_APIC_irq();
+}
+
+static void dma_msi_data_init(struct iommu *iommu, int vector)
+{
+    u32 msi_data = 0;
+    unsigned long flags;
+
+    /* Fixed, edge, assert mode. Follow MSI setting */
+    msi_data |= vector & 0xff;
+    msi_data |= 1 << 14;
+
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    dmar_writel(iommu->reg, DMAR_FEDATA_REG, msi_data);
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+#ifdef SUPPORT_MSI_REMAPPING
+static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu)
+{
+    u64 msi_address;
+    unsigned long flags;
+
+    /* Physical, dedicated cpu. Follow MSI setting */
+    msi_address = (MSI_ADDRESS_HEADER << (MSI_ADDRESS_HEADER_SHIFT + 8));
+    msi_address |= MSI_PHYSICAL_MODE << 2;
+    msi_address |= MSI_REDIRECTION_HINT_MODE << 3;
+    msi_address |= phy_cpu << MSI_TARGET_CPU_SHIFT;
+
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    dmar_writel(iommu->reg, DMAR_FEADDR_REG, (u32)msi_address);
+    dmar_writel(iommu->reg, DMAR_FEUADDR_REG, (u32)(msi_address >> 32));
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+#else
+static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu)
+{
+    /* ia64: TODO */
+}
+#endif
+
+static void dma_msi_set_affinity(unsigned int vector, cpumask_t dest)
+{
+    struct iommu *iommu = vector_to_iommu[vector];
+    dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(dest)));
+}
+
+static struct hw_interrupt_type dma_msi_type = {
+    .typename = "DMA_MSI",
+    .startup = dma_msi_startup,
+    .shutdown = dma_msi_mask,
+    .enable = dma_msi_unmask,
+    .disable = dma_msi_mask,
+    .ack = dma_msi_mask,
+    .end = dma_msi_end,
+    .set_affinity = dma_msi_set_affinity,
+};
+
+int iommu_set_interrupt(struct iommu *iommu)
+{
+    int vector, ret;
+
+    vector = assign_irq_vector(AUTO_ASSIGN);
+    vector_to_iommu[vector] = iommu;
+
+    if ( !vector )
+    {
+        gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no vectors\n");
+        return -EINVAL;
+    }
+
+    irq_desc[vector].handler = &dma_msi_type;
+    ret = request_irq(vector, iommu_page_fault, 0, "dmar", iommu);
+    if ( ret )
+        gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
+    return vector;
+}
+
+static int iommu_alloc(struct acpi_drhd_unit *drhd)
+{
+    struct iommu *iommu;
+    unsigned long sagaw;
+    int agaw;
+
+    if ( nr_iommus > MAX_IOMMUS )
+    {
+        gdprintk(XENLOG_ERR VTDPREFIX,
+                 "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus);
+        return -ENOMEM;
+    }
+
+    iommu = xmalloc(struct iommu);
+    if ( iommu == NULL )
+        return -ENOMEM;
+    memset(iommu, 0, sizeof(struct iommu));
+
+    iommu->intel = alloc_intel_iommu();
+    if ( iommu->intel == NULL )
+    {
+        xfree(iommu);
+        return -ENOMEM;
+    }
+
+    iommu->reg = map_to_nocache_virt(nr_iommus, drhd->address);
+    iommu->index = nr_iommus++;
+
+    iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
+    iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG);
+
+    /* Calculate number of pagetable levels: between 2 and 4. */
+    sagaw = cap_sagaw(iommu->cap);
+    for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- )
+        if ( test_bit(agaw, &sagaw) )
+            break;
+    if ( agaw < 0 )
+    {
+        gdprintk(XENLOG_ERR VTDPREFIX,
+                 "IOMMU: unsupported sagaw %lx\n", sagaw);
+        xfree(iommu);
+        return -ENODEV;
+    }
+    iommu->nr_pt_levels = agaw_to_level(agaw);
+
+    if ( !ecap_coherent(iommu->ecap) )
+        iommus_incoherent = 1;
+
+    spin_lock_init(&iommu->lock);
+    spin_lock_init(&iommu->register_lock);
+
+    drhd->iommu = iommu;
+    return 0;
+}
+
+static void iommu_free(struct acpi_drhd_unit *drhd)
+{
+    struct iommu *iommu = drhd->iommu;
+
+    if ( iommu == NULL )
+        return;
+
+    if ( iommu->root_maddr != 0 )
+    {
+        free_pgtable_maddr(iommu->root_maddr);
+        iommu->root_maddr = 0;
+    }
+
+    if ( iommu->reg )
+        iounmap(iommu->reg);
+
+    free_intel_iommu(iommu->intel);
+    free_irq(iommu->vector);
+    xfree(iommu);
+
+    drhd->iommu = NULL;
+}
+
+#define guestwidth_to_adjustwidth(gaw) ({       \
+    int agaw, r = (gaw - 12) % 9;               \
+    agaw = (r == 0) ? gaw : (gaw + 9 - r);      \
+    if ( agaw > 64 )                            \
+        agaw = 64;                              \
+    agaw; })
+
+static int intel_iommu_domain_init(struct domain *d)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    u64 i, j, tmp;
+    struct acpi_drhd_unit *drhd;
+
+    hd->agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
+
+    if ( d->domain_id == 0 )
+    {
+        extern int xen_in_range(paddr_t start, paddr_t end);
+
+        /*
+         * Set up 1:1 page table for dom0 except the critical segments
+         * like Xen.
+         */
+        for ( i = 0; i < max_page; i++ )
+        {
+            if ( xen_in_range(i << PAGE_SHIFT, (i + 1) << PAGE_SHIFT) )
+                continue;
+
+            tmp = 1 << (PAGE_SHIFT - PAGE_SHIFT_4K);
+            for ( j = 0; j < tmp; j++ )
+                iommu_map_page(d, (i*tmp+j), (i*tmp+j));
+        }
+
+        setup_dom0_devices(d);
+        setup_dom0_rmrr(d);
+
+        iommu_flush_all();
+
+        for_each_drhd_unit ( drhd )
+            iommu_enable_translation(drhd->iommu);
+    }
+
+    return 0;
+}
+
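+/*
+ * Install the context entry for (bus, devfn) on this IOMMU, pointing it at
+ * the domain's page tables (or pass-through translation for dom0 when
+ * enabled), then flush the context cache and IOTLB.
+ */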
+static int domain_context_mapping_one(
+    struct domain *domain,
+    struct iommu *iommu,
+    u8 bus, u8 devfn)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(domain);
+    struct context_entry *context, *context_entries;
+    u64 maddr, pgd_maddr;
+    struct pci_dev *pdev = NULL;
+    int agaw;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    spin_lock(&iommu->lock);
+    maddr = bus_to_context_maddr(iommu, bus);
+    context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
+    context = &context_entries[devfn];
+
+    if ( context_present(*context) )
+    {
+        int res = 0;
+
+        pdev = pci_get_pdev(bus, devfn);
+        if (!pdev)
+            res = -ENODEV;
+        else if (pdev->domain != domain)
+            res = -EINVAL;
+        unmap_vtd_domain_page(context_entries);
+        spin_unlock(&iommu->lock);
+        return res;
+    }
+
+    if ( iommu_passthrough &&
+         ecap_pass_thru(iommu->ecap) && (domain->domain_id == 0) )
+    {
+        context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
+        agaw = level_to_agaw(iommu->nr_pt_levels);
+    }
+    else
+    {
+        spin_lock(&hd->mapping_lock);
+
+        /* Ensure we have pagetables allocated down to leaf PTE. */
+        if ( hd->pgd_maddr == 0 )
+        {
+            addr_to_dma_page_maddr(domain, 0, 1);
+            if ( hd->pgd_maddr == 0 )
+            {
+            nomem:
+                spin_unlock(&hd->mapping_lock);
+                spin_unlock(&iommu->lock);
+                unmap_vtd_domain_page(context_entries);
+                return -ENOMEM;
+            }
+        }
+
+        /* Skip top levels of page tables for 2- and 3-level DRHDs. */
+        pgd_maddr = hd->pgd_maddr;
+        for ( agaw = level_to_agaw(4);
+              agaw != level_to_agaw(iommu->nr_pt_levels);
+              agaw-- )
+        {
+            struct dma_pte *p = map_vtd_domain_page(pgd_maddr);
+            pgd_maddr = dma_pte_addr(*p);
+            unmap_vtd_domain_page(p);
+            if ( pgd_maddr == 0 )
+                goto nomem;
+        }
+
+        context_set_address_root(*context, pgd_maddr);
+        context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
+        spin_unlock(&hd->mapping_lock);
+    }
+
+    /*
+     * Domain id 0 is not valid on Intel's IOMMU; force domain ids to be
+     * 1-based, as required by Intel's IOMMU hardware.
+     */
+    context_set_domain_id(context, domain);
+    context_set_address_width(*context, agaw);
+    context_set_fault_enable(*context);
+    context_set_present(*context);
+    iommu_flush_cache_entry(context);
+    spin_unlock(&iommu->lock);
+
+    /* Context entry was previously non-present (with domid 0). */
+    if ( iommu_flush_context_device(iommu, 0, (((u16)bus) << 8) | devfn,
+                                    DMA_CCMD_MASK_NOBIT, 1) )
+        iommu_flush_write_buffer(iommu);
+    else
+        iommu_flush_iotlb_dsi(iommu, 0, 1);
+
+    set_bit(iommu->index, &hd->iommu_bitmap);
+
+    unmap_vtd_domain_page(context_entries);
+
+    return 0;
+}
+
+#define PCI_BASE_CLASS_BRIDGE    0x06
+#define PCI_CLASS_BRIDGE_PCI     0x0604
+
+enum {
+    DEV_TYPE_PCIe_ENDPOINT,
+    DEV_TYPE_PCIe_BRIDGE,    // PCIe root port, switch
+    DEV_TYPE_PCI_BRIDGE,     // PCIe-to-PCI/PCIx bridge, PCI-to-PCI bridge
+    DEV_TYPE_PCI,
+};
+
+int pdev_type(u8 bus, u8 devfn)
+{
+    u16 class_device;
+    u16 status, creg;
+    int pos;
+    u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn);
+
+    class_device = pci_conf_read16(bus, d, f, PCI_CLASS_DEVICE);
+    if ( class_device == PCI_CLASS_BRIDGE_PCI )
+    {
+        pos = pci_find_next_cap(bus, devfn,
+                                PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP);
+        if ( !pos )
+            return DEV_TYPE_PCI_BRIDGE;
+        creg = pci_conf_read16(bus, d, f, pos + PCI_EXP_FLAGS);
+        return ((creg & PCI_EXP_FLAGS_TYPE) >> 4) == PCI_EXP_TYPE_PCI_BRIDGE ?
+            DEV_TYPE_PCI_BRIDGE : DEV_TYPE_PCIe_BRIDGE;
+    }
+
+    status = pci_conf_read16(bus, d, f, PCI_STATUS);
+    if ( !(status & PCI_STATUS_CAP_LIST) )
+        return DEV_TYPE_PCI;
+
+    if ( pci_find_next_cap(bus, devfn, PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP) )
+        return DEV_TYPE_PCIe_ENDPOINT;
+
+    return DEV_TYPE_PCI;
+}
+
+#define MAX_BUSES 256
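+/*
+ * bus2bridge[] records, for each secondary bus, the (bus, devfn) of the
+ * bridge in front of it, so that requester IDs of devices behind
+ * PCI/PCIx bridges can be traced back to the bridge.
+ */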
+static DEFINE_SPINLOCK(bus2bridge_lock);
+static struct { u8 map, bus, devfn; } bus2bridge[MAX_BUSES];
+
+static int _find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
+{
+    int cnt = 0;
+    *secbus = *bus;
+
+    ASSERT(spin_is_locked(&bus2bridge_lock));
+    if ( !bus2bridge[*bus].map )
+        return 0;
+
+    while ( bus2bridge[*bus].map )
+    {
+        *secbus = *bus;
+        *devfn = bus2bridge[*bus].devfn;
+        *bus = bus2bridge[*bus].bus;
+        if ( cnt++ >= MAX_BUSES )
+            return 0;
+    }
+
+    return 1;
+}
+
+static int find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
+{
+    int ret = 0;
+
+    if ( *bus == 0 )
+        /* assume integrated PCI devices in RC have valid requester-id */
+        return 1;
+
+    spin_lock(&bus2bridge_lock);
+    ret = _find_pcie_endpoint(bus, devfn, secbus);
+    spin_unlock(&bus2bridge_lock);
+
+    return ret;
+}
+
+static int domain_context_mapping(struct domain *domain, u8 bus, u8 devfn)
+{
+    struct acpi_drhd_unit *drhd;
+    int ret = 0;
+    u16 sec_bus, sub_bus;
+    u32 type;
+    u8 secbus, secdevfn;
+
+    drhd = acpi_find_matched_drhd_unit(bus, devfn);
+    if ( !drhd )
+        return -ENODEV;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
+    type = pdev_type(bus, devfn);
+    switch ( type )
+    {
+    case DEV_TYPE_PCIe_BRIDGE:
+        break;
+
+    case DEV_TYPE_PCI_BRIDGE:
+        sec_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+                                 PCI_SECONDARY_BUS);
+        sub_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+                                 PCI_SUBORDINATE_BUS);
+
+        spin_lock(&bus2bridge_lock);
+        for ( sub_bus &= 0xff; sec_bus <= sub_bus; sec_bus++ )
+        {
+            bus2bridge[sec_bus].map = 1;
+            bus2bridge[sec_bus].bus =  bus;
+            bus2bridge[sec_bus].devfn =  devfn;
+        }
+        spin_unlock(&bus2bridge_lock);
+        break;
+
+    case DEV_TYPE_PCIe_ENDPOINT:
+        gdprintk(XENLOG_INFO VTDPREFIX,
+                 "domain_context_mapping:PCIe: bdf = %x:%x.%x\n",
+                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+        ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
+        break;
+
+    case DEV_TYPE_PCI:
+        gdprintk(XENLOG_INFO VTDPREFIX,
+                 "domain_context_mapping:PCI: bdf = %x:%x.%x\n",
+                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+        ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
+        if ( ret )
+           break;
+
+        secbus = bus;
+        secdevfn = devfn;
+        /* dependent devices mapping */
+        while ( bus2bridge[bus].map )
+        {
+            secbus = bus;
+            secdevfn = devfn;
+            devfn = bus2bridge[bus].devfn;
+            bus = bus2bridge[bus].bus;
+            ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
+            if ( ret )
+                return ret;
+        }
+
+        if ( (secbus != bus) && (secdevfn != 0) )
+            /*
+             * The source-id for transactions on non-PCIe buses seems
+             * to originate from devfn=0 on the secondary bus behind
+             * the bridge.  Map that id as well.  The id to use in
+             * these scenarios is not particularly well documented
+             * anywhere.
+             */
+            ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0);
+        break;
+
+    default:
+        gdprintk(XENLOG_ERR VTDPREFIX,
+                 "domain_context_mapping:unknown type : bdf = %x:%x.%x\n",
+                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+        ret = -EINVAL;
+        break;
+    }
+
+    return ret;
+}
+
+static int domain_context_unmap_one(
+    struct domain *domain,
+    struct iommu *iommu,
+    u8 bus, u8 devfn)
+{
+    struct context_entry *context, *context_entries;
+    u64 maddr;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    spin_lock(&iommu->lock);
+
+    maddr = bus_to_context_maddr(iommu, bus);
+    context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
+    context = &context_entries[devfn];
+
+    if ( !context_present(*context) )
+    {
+        spin_unlock(&iommu->lock);
+        unmap_vtd_domain_page(context_entries);
+        return 0;
+    }
+
+    context_clear_present(*context);
+    context_clear_entry(*context);
+    iommu_flush_cache_entry(context);
+
+    if ( iommu_flush_context_device(iommu, domain_iommu_domid(domain),
+                                    (((u16)bus) << 8) | devfn,
+                                    DMA_CCMD_MASK_NOBIT, 0) )
+        iommu_flush_write_buffer(iommu);
+    else
+        iommu_flush_iotlb_dsi(iommu, domain_iommu_domid(domain), 0);
+
+    spin_unlock(&iommu->lock);
+    unmap_vtd_domain_page(context_entries);
+
+    return 0;
+}
+
+static int domain_context_unmap(struct domain *domain, u8 bus, u8 devfn)
+{
+    struct acpi_drhd_unit *drhd;
+    int ret = 0;
+    u32 type;
+    u8 secbus, secdevfn;
+
+    drhd = acpi_find_matched_drhd_unit(bus, devfn);
+    if ( !drhd )
+        return -ENODEV;
+
+    type = pdev_type(bus, devfn);
+    switch ( type )
+    {
+    case DEV_TYPE_PCIe_BRIDGE:
+    case DEV_TYPE_PCI_BRIDGE:
+        break;
+
+    case DEV_TYPE_PCIe_ENDPOINT:
+        gdprintk(XENLOG_INFO VTDPREFIX,
+                 "domain_context_unmap:PCIe: bdf = %x:%x.%x\n",
+                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+        ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
+        break;
+
+    case DEV_TYPE_PCI:
+        gdprintk(XENLOG_INFO VTDPREFIX,
+                 "domain_context_unmap:PCI: bdf = %x:%x.%x\n",
+                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+        ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
+        if ( ret )
+            break;
+
+        secbus = bus;
+        secdevfn = devfn;
+        /* dependent devices unmapping */
+        while ( bus2bridge[bus].map )
+        {
+            secbus = bus;
+            secdevfn = devfn;
+            devfn = bus2bridge[bus].devfn;
+            bus = bus2bridge[bus].bus;
+            ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
+            if ( ret )
+                return ret;
+        }
+
+        if ( (secbus != bus) && (secdevfn != 0) )
+            ret = domain_context_unmap_one(domain, drhd->iommu, secbus, 0);
+        break;
+
+    default:
+        gdprintk(XENLOG_ERR VTDPREFIX,
+                 "domain_context_unmap:unknown type: bdf = %x:%x:%x\n",
+                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+        ret = -EINVAL;
+        break;
+    }
+
+    return ret;
+}
+
+static int reassign_device_ownership(
+    struct domain *source,
+    struct domain *target,
+    u8 bus, u8 devfn)
+{
+    struct hvm_iommu *source_hd = domain_hvm_iommu(source);
+    struct pci_dev *pdev;
+    struct acpi_drhd_unit *drhd;
+    struct iommu *pdev_iommu;
+    int ret, found = 0;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    pdev = pci_get_pdev_by_domain(source, bus, devfn);
+
+    if (!pdev)
+        return -ENODEV;
+
+    drhd = acpi_find_matched_drhd_unit(bus, devfn);
+    if ( !drhd )
+        return -ENODEV;
+    pdev_iommu = drhd->iommu;
+    domain_context_unmap(source, bus, devfn);
+
+    ret = domain_context_mapping(target, bus, devfn);
+    if ( ret )
+        return ret;
+
+    list_move(&pdev->domain_list, &target->arch.pdev_list);
+    pdev->domain = target;
+
+    for_each_pdev ( source, pdev )
+    {
+        drhd = acpi_find_matched_drhd_unit(pdev->bus, pdev->devfn);
+        if ( drhd && drhd->iommu == pdev_iommu )
+        {
+            found = 1;
+            break;
+        }
+    }
+
+    if ( !found )
+        clear_bit(pdev_iommu->index, &source_hd->iommu_bitmap);
+
+    return ret;
+}
+
+void iommu_domain_teardown(struct domain *d)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+
+    if ( list_empty(&acpi_drhd_units) )
+        return;
+
+    spin_lock(&hd->mapping_lock);
+    iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw));
+    hd->pgd_maddr = 0;
+    spin_unlock(&hd->mapping_lock);
+
+    iommu_domid_release(d);
+}
+
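+/*
+ * Map one gfn->mfn translation in the domain's DMA page tables and issue a
+ * page-selective IOTLB flush on every IOMMU the domain uses.
+ */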
+int intel_iommu_map_page(
+    struct domain *d, unsigned long gfn, unsigned long mfn)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    struct dma_pte *page = NULL, *pte = NULL;
+    u64 pg_maddr;
+    int pte_present;
+
+    drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+    iommu = drhd->iommu;
+
+    /* Do nothing for dom0 if the IOMMU supports pass-through. */
+    if ( iommu_passthrough &&
+         ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) )
+        return 0;
+
+    spin_lock(&hd->mapping_lock);
+
+    pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1);
+    if ( pg_maddr == 0 )
+    {
+        spin_unlock(&hd->mapping_lock);
+        return -ENOMEM;
+    }
+    page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
+    pte = page + (gfn & LEVEL_MASK);
+    pte_present = dma_pte_present(*pte);
+    dma_set_pte_addr(*pte, (paddr_t)mfn << PAGE_SHIFT_4K);
+    dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE);
+
+    /* Set the SNP on leaf page table if Snoop Control available */
+    if ( iommu_snoop )
+        dma_set_pte_snp(*pte);
+
+    iommu_flush_cache_entry(pte);
+    spin_unlock(&hd->mapping_lock);
+    unmap_vtd_domain_page(page);
+
+    /*
+     * No need for pcidevs_lock here because we flush when
+     * assigning/deassigning a device.
+     */
+    for_each_drhd_unit ( drhd )
+    {
+        iommu = drhd->iommu;
+
+        if ( !test_bit(iommu->index, &hd->iommu_bitmap) )
+            continue;
+
+        if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(d),
+                                   (paddr_t)gfn << PAGE_SHIFT_4K, 1,
+                                   !pte_present) )
+            iommu_flush_write_buffer(iommu);
+    }
+
+    return 0;
+}
+
+int intel_iommu_unmap_page(struct domain *d, unsigned long gfn)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+
+    drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+    iommu = drhd->iommu;
+
+    /* Do nothing for dom0 if the IOMMU supports pass-through. */
+    if ( iommu_passthrough &&
+         ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) )
+        return 0;
+
+    dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K);
+
+    return 0;
+}
+
+static int iommu_prepare_rmrr_dev(struct domain *d,
+                                  struct acpi_rmrr_unit *rmrr,
+                                  u8 bus, u8 devfn)
+{
+    int ret = 0;
+    u64 base, end;
+    unsigned long base_pfn, end_pfn;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    ASSERT(rmrr->base_address < rmrr->end_address);
+
+    base = rmrr->base_address & PAGE_MASK_4K;
+    base_pfn = base >> PAGE_SHIFT_4K;
+    end = PAGE_ALIGN_4K(rmrr->end_address);
+    end_pfn = end >> PAGE_SHIFT_4K;
+
+    while ( base_pfn < end_pfn )
+    {
+        intel_iommu_map_page(d, base_pfn, base_pfn);
+        base_pfn++;
+    }
+
+    ret = domain_context_mapping(d, bus, devfn);
+
+    return ret;
+}
+
+static int intel_iommu_add_device(struct pci_dev *pdev)
+{
+    struct acpi_rmrr_unit *rmrr;
+    u16 bdf;
+    int ret, i;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
+    if ( !pdev->domain )
+        return -EINVAL;
+
+    ret = domain_context_mapping(pdev->domain, pdev->bus, pdev->devfn);
+    if ( ret )
+    {
+        gdprintk(XENLOG_ERR VTDPREFIX,
+                 "intel_iommu_add_device: context mapping failed\n");
+        return ret;
+    }
+
+    for_each_rmrr_device ( rmrr, bdf, i )
+    {
+        if ( PCI_BUS(bdf) == pdev->bus && PCI_DEVFN2(bdf) == pdev->devfn )
+        {
+            ret = iommu_prepare_rmrr_dev(pdev->domain, rmrr,
+                                         pdev->bus, pdev->devfn);
+            if ( ret )
+                gdprintk(XENLOG_ERR VTDPREFIX,
+                         "intel_iommu_add_device: RMRR mapping failed\n");
+            break;
+        }
+    }
+
+    return ret;
+}
+
+static int intel_iommu_remove_device(struct pci_dev *pdev)
+{
+    struct acpi_rmrr_unit *rmrr;
+    u16 bdf;
+    int i;
+
+    if ( !pdev->domain )
+        return -EINVAL;
+
+    /* If the device belongs to dom0 and has an RMRR, don't remove it
+     * from dom0, because the BIOS may use the RMRR at boot time.
+     */
+    if ( pdev->domain->domain_id == 0 )
+    {
+        for_each_rmrr_device ( rmrr, bdf, i )
+        {
+            if ( PCI_BUS(bdf) == pdev->bus &&
+                 PCI_DEVFN2(bdf) == pdev->devfn )
+                return 0;
+        }
+    }
+
+    return domain_context_unmap(pdev->domain, pdev->bus, pdev->devfn);
+}
+
+static void setup_dom0_devices(struct domain *d)
+{
+    struct hvm_iommu *hd;
+    struct pci_dev *pdev;
+    int bus, dev, func;
+    u32 l;
+
+    hd = domain_hvm_iommu(d);
+
+    spin_lock(&pcidevs_lock);
+    for ( bus = 0; bus < 256; bus++ )
+    {
+        for ( dev = 0; dev < 32; dev++ )
+        {
+            for ( func = 0; func < 8; func++ )
+            {
+                l = pci_conf_read32(bus, dev, func, PCI_VENDOR_ID);
+                /* some broken boards return 0 or ~0 if a slot is empty: */
+                if ( (l == 0xffffffff) || (l == 0x00000000) ||
+                     (l == 0x0000ffff) || (l == 0xffff0000) )
+                    continue;
+
+                pdev = alloc_pdev(bus, PCI_DEVFN(dev, func));
+                pdev->domain = d;
+                list_add(&pdev->domain_list, &d->arch.pdev_list);
+                domain_context_mapping(d, pdev->bus, pdev->devfn);
+            }
+        }
+    }
+    spin_unlock(&pcidevs_lock);
+}
+
+void clear_fault_bits(struct iommu *iommu)
+{
+    u64 val;
+
+    val = dmar_readq(
+        iommu->reg,
+        cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+0x8);
+    dmar_writeq(
+        iommu->reg,
+        cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+8,
+        val);
+    dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS);
+}
+
+static int init_vtd_hw(void)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    struct iommu_flush *flush = NULL;
+    int vector;
+    int ret;
+
+    for_each_drhd_unit ( drhd )
+    {
+        iommu = drhd->iommu;
+        ret = iommu_set_root_entry(iommu);
+        if ( ret )
+        {
+            gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n");
+            return -EIO;
+        }
+
+        vector = iommu_set_interrupt(iommu);
+        dma_msi_data_init(iommu, vector);
+        dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map)));
+        iommu->vector = vector;
+        clear_fault_bits(iommu);
+        dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
+
+        /* initialize flush functions */
+        flush = iommu_get_flush(iommu);
+        flush->context = flush_context_reg;
+        flush->iotlb = flush_iotlb_reg;
+    }
+
+    for_each_drhd_unit ( drhd )
+    {
+        iommu = drhd->iommu;
+        if ( qinval_setup(iommu) != 0 )
+            dprintk(XENLOG_INFO VTDPREFIX,
+                    "Queued Invalidation hardware not found\n");
+    }
+
+    if ( iommu_intremap )
+    {
+        for_each_drhd_unit ( drhd )
+        {
+            iommu = drhd->iommu;
+            if ( intremap_setup(iommu) != 0 )
+                dprintk(XENLOG_INFO VTDPREFIX,
+                        "Interrupt Remapping hardware not found\n");
+        }
+    }
+
+    return 0;
+}
+
+static void setup_dom0_rmrr(struct domain *d)
+{
+    struct acpi_rmrr_unit *rmrr;
+    u16 bdf;
+    int ret, i;
+
+    spin_lock(&pcidevs_lock);
+    for_each_rmrr_device ( rmrr, bdf, i )
+    {
+        ret = iommu_prepare_rmrr_dev(d, rmrr, PCI_BUS(bdf), PCI_DEVFN2(bdf));
+        if ( ret )
+            gdprintk(XENLOG_ERR VTDPREFIX,
+                     "IOMMU: mapping reserved region failed\n");
+    }
+    spin_unlock(&pcidevs_lock);
+}
+
+static void platform_quirks(void)
+{
+    u32 id;
+
+    /* The Mobile 4 Series Chipset neglects to set the RWBF capability,
+     * but needs it.
+     */
+    id = pci_conf_read32(0, 0, 0, 0);
+    if ( id == 0x2a408086 )
+    {
+        rwbf_quirk = 1;
+        gdprintk(XENLOG_INFO VTDPREFIX, "DMAR: Forcing write-buffer flush\n");
+    }
+}
+
+int intel_vtd_setup(void)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+
+    if ( !vtd_enabled )
+        return -ENODEV;
+
+    platform_quirks();
+
+    spin_lock_init(&domid_bitmap_lock);
+    clflush_size = get_cache_line_size();
+
+    for_each_drhd_unit ( drhd )
+        if ( iommu_alloc(drhd) != 0 )
+            goto error;
+
+    /* Use the first DRHD unit to determine hardware capabilities. */
+    drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+    iommu = drhd->iommu;
+
+    /* Allocate domain id bitmap, and set bit 0 as reserved */
+    domid_bitmap_size = cap_ndoms(iommu->cap);
+    domid_bitmap = xmalloc_array(unsigned long,
+                                 BITS_TO_LONGS(domid_bitmap_size));
+    if ( domid_bitmap == NULL )
+        goto error;
+    memset(domid_bitmap, 0, domid_bitmap_size / 8);
+    set_bit(0, domid_bitmap);
+
+    if ( init_vtd_hw() )
+        goto error;
+
+    /* Given that all devices within a guest use the same IO page table,
+     * enable snoop control only if all VT-d engines support it.
+     */
+    if ( iommu_snoop )
+    {
+        for_each_drhd_unit ( drhd )
+        {
+            iommu = drhd->iommu;
+            if ( !ecap_snp_ctl(iommu->ecap) )
+            {
+                iommu_snoop = 0;
+                break;
+            }
+        }
+    }
+
+    printk("Intel VT-d snoop control %sabled\n", iommu_snoop ? "en" : "dis");
+    return 0;
+
+ error:
+    for_each_drhd_unit ( drhd )
+        iommu_free(drhd);
+    vtd_enabled = 0;
+    iommu_snoop = 0;
+    return -ENOMEM;
+}
+
+/*
+ * If the device isn't owned by dom0, it has either already been
+ * assigned to another domain or it doesn't exist.
+ */
+int device_assigned(u8 bus, u8 devfn)
+{
+    struct pci_dev *pdev;
+
+    spin_lock(&pcidevs_lock);
+    pdev = pci_get_pdev_by_domain(dom0, bus, devfn);
+    if (!pdev)
+    {
+        spin_unlock(&pcidevs_lock);
+        return -1;
+    }
+
+    spin_unlock(&pcidevs_lock);
+    return 0;
+}
+
+int intel_iommu_assign_device(struct domain *d, u8 bus, u8 devfn)
+{
+    struct acpi_rmrr_unit *rmrr;
+    int ret = 0, i;
+    struct pci_dev *pdev;
+    u16 bdf;
+
+    if ( list_empty(&acpi_drhd_units) )
+        return -ENODEV;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    pdev = pci_get_pdev(bus, devfn);
+    if (!pdev)
+        return -ENODEV;
+
+    if ( pdev->domain != dom0 )
+    {
+        gdprintk(XENLOG_ERR VTDPREFIX,
+                 "IOMMU: cannot assign an already assigned device\n");
+        return -EBUSY;
+    }
+
+    ret = reassign_device_ownership(dom0, d, bus, devfn);
+    if ( ret )
+        goto done;
+
+    /* Setup rmrr identity mapping */
+    for_each_rmrr_device( rmrr, bdf, i )
+    {
+        if ( PCI_BUS(bdf) == bus && PCI_DEVFN2(bdf) == devfn )
+        {
+            /* FIXME: Because the USB RMRR conflicts with the guest BIOS
+             * region, ignore USB RMRRs for now.
+             */
+            if ( is_usb_device(bus, devfn) )
+            {
+                ret = 0;
+                goto done;
+            }
+
+            ret = iommu_prepare_rmrr_dev(d, rmrr, bus, devfn);
+            if ( ret )
+                gdprintk(XENLOG_ERR VTDPREFIX,
+                         "IOMMU: mapping reserved region failed\n");
+            goto done;
+        }
+    }
+
+done:
+    return ret;
+}
+
+static int intel_iommu_group_id(u8 bus, u8 devfn)
+{
+    u8 secbus;
+    if ( !bus2bridge[bus].map || find_pcie_endpoint(&bus, &devfn, &secbus) )
+        return PCI_BDF2(bus, devfn);
+    else
+        return -1;
+}
+
+static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
+void iommu_suspend(void)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    u32    i;
+
+    if ( !vtd_enabled )
+        return;
+
+    iommu_flush_all();
+
+    for_each_drhd_unit ( drhd )
+    {
+        iommu = drhd->iommu;
+        i = iommu->index;
+
+        iommu_state[i][DMAR_FECTL_REG] =
+            (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
+        iommu_state[i][DMAR_FEDATA_REG] =
+            (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
+        iommu_state[i][DMAR_FEADDR_REG] =
+            (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
+        iommu_state[i][DMAR_FEUADDR_REG] =
+            (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
+    }
+}
+
+void iommu_resume(void)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    u32 i;
+
+    if ( !vtd_enabled )
+        return;
+
+    iommu_flush_all();
+
+    if ( init_vtd_hw() != 0 && force_iommu )
+        panic("IOMMU setup failed, crash Xen for security purposes!\n");
+
+    for_each_drhd_unit ( drhd )
+    {
+        iommu = drhd->iommu;
+        i = iommu->index;
+
+        dmar_writel(iommu->reg, DMAR_FECTL_REG,
+                    (u32) iommu_state[i][DMAR_FECTL_REG]);
+        dmar_writel(iommu->reg, DMAR_FEDATA_REG,
+                    (u32) iommu_state[i][DMAR_FEDATA_REG]);
+        dmar_writel(iommu->reg, DMAR_FEADDR_REG,
+                    (u32) iommu_state[i][DMAR_FEADDR_REG]);
+        dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
+                    (u32) iommu_state[i][DMAR_FEUADDR_REG]);
+        iommu_enable_translation(iommu);
+    }
+}
+
+struct iommu_ops intel_iommu_ops = {
+    .init = intel_iommu_domain_init,
+    .add_device = intel_iommu_add_device,
+    .remove_device = intel_iommu_remove_device,
+    .assign_device  = intel_iommu_assign_device,
+    .teardown = iommu_domain_teardown,
+    .map_page = intel_iommu_map_page,
+    .unmap_page = intel_iommu_unmap_page,
+    .reassign_device = reassign_device_ownership,
+    .get_device_group_id = intel_iommu_group_id,
+    .update_ire_from_apic = io_apic_write_remap_rte,
+    .update_ire_from_msi = msi_msg_write_remap_rte,
+};
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
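/*
 * Aside (illustrative sketch, not part of the patch above): the RMRR
 * identity-mapping loop in iommu_prepare_rmrr_dev() simply walks the 4K
 * frames covering the reserved range.  A minimal standalone sketch of that
 * arithmetic, using a made-up RMRR range and local copies of the page
 * macros, purely to show how base_pfn/end_pfn are derived:
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT_4K    12
#define PAGE_SIZE_4K     (1ULL << PAGE_SHIFT_4K)
#define PAGE_MASK_4K     (~(PAGE_SIZE_4K - 1))
#define PAGE_ALIGN_4K(a) (((a) + PAGE_SIZE_4K - 1) & PAGE_MASK_4K)

int main(void)
{
    /* Hypothetical RMRR range, not taken from any real DMAR table. */
    uint64_t base_address = 0xbf7ec000ULL, end_address = 0xbf7fcfffULL;
    uint64_t base = base_address & PAGE_MASK_4K;
    uint64_t end  = PAGE_ALIGN_4K(end_address);
    uint64_t pfn;

    /* Each 4K frame in [base, end) is mapped 1:1 (pfn -> pfn). */
    for ( pfn = base >> PAGE_SHIFT_4K; pfn < (end >> PAGE_SHIFT_4K); pfn++ )
        printf("identity-map pfn 0x%llx -> 0x%llx\n",
               (unsigned long long)pfn, (unsigned long long)pfn);
    return 0;
}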
diff -Naurp xen/drivers/passthrough/vtd/iommu.h xen-redhat/drivers/passthrough/vtd/iommu.h
--- xen/drivers/passthrough/vtd/iommu.h
+++ xen-redhat/drivers/passthrough/vtd/iommu.h
@@ -0,0 +1,497 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Ashok Raj <ashok.raj@intel.com>
+ */
+
+#ifndef _INTEL_IOMMU_H_
+#define _INTEL_IOMMU_H_
+
+#include <xen/types.h>
+
+/*
+ * Intel IOMMU register specification per version 1.0 public spec.
+ */
+
+#define    DMAR_VER_REG    0x0    /* Arch version supported by this IOMMU */
+#define    DMAR_CAP_REG    0x8    /* Hardware supported capabilities */
+#define    DMAR_ECAP_REG    0x10    /* Extended capabilities supported */
+#define    DMAR_GCMD_REG    0x18    /* Global command register */
+#define    DMAR_GSTS_REG    0x1c    /* Global status register */
+#define    DMAR_RTADDR_REG    0x20    /* Root entry table */
+#define    DMAR_CCMD_REG    0x28    /* Context command reg */
+#define    DMAR_FSTS_REG    0x34    /* Fault Status register */
+#define    DMAR_FECTL_REG    0x38    /* Fault control register */
+#define    DMAR_FEDATA_REG    0x3c    /* Fault event interrupt data register */
+#define    DMAR_FEADDR_REG    0x40    /* Fault event interrupt addr register */
+#define    DMAR_FEUADDR_REG 0x44    /* Upper address register */
+#define    DMAR_AFLOG_REG    0x58    /* Advanced Fault control */
+#define    DMAR_PMEN_REG    0x64    /* Enable Protected Memory Region */
+#define    DMAR_PLMBASE_REG 0x68    /* PMRR Low addr */
+#define    DMAR_PLMLIMIT_REG 0x6c    /* PMRR low limit */
+#define    DMAR_PHMBASE_REG 0x70    /* pmrr high base addr */
+#define    DMAR_PHMLIMIT_REG 0x78    /* pmrr high limit */
+#define    DMAR_IQH_REG    0x80    /* invalidation queue head */
+#define    DMAR_IQT_REG    0x88    /* invalidation queue tail */
+#define    DMAR_IQA_REG    0x90    /* invalidation queue addr */
+#define    DMAR_IRTA_REG   0xB8    /* intr remap */
+
+#define OFFSET_STRIDE        (9)
+#define dmar_readl(dmar, reg) readl(dmar + reg)
+#define dmar_writel(dmar, reg, val) writel(val, dmar + reg)
+#define dmar_readq(dmar, reg) ({ \
+        u32 lo, hi; \
+        lo = dmar_readl(dmar, reg); \
+        hi = dmar_readl(dmar, reg + 4); \
+        (((u64) hi) << 32) + lo; })
+#define dmar_writeq(dmar, reg, val) do {\
+        dmar_writel(dmar, reg, (u32)val); \
+        dmar_writel(dmar, reg + 4, (u32)((u64) val >> 32)); \
+    } while (0)
+
+#define VER_MAJOR(v)        (((v) & 0xf0) >> 4)
+#define VER_MINOR(v)        ((v) & 0x0f)
+
+/*
+ * Decoding Capability Register
+ */
+#define cap_read_drain(c)    (((c) >> 55) & 1)
+#define cap_write_drain(c)    (((c) >> 54) & 1)
+#define cap_max_amask_val(c)    (((c) >> 48) & 0x3f)
+#define cap_num_fault_regs(c)    ((((c) >> 40) & 0xff) + 1)
+#define cap_pgsel_inv(c)       (((c) >> 39) & 1)
+
+#define cap_super_page_val(c)    (((c) >> 34) & 0xf)
+#define cap_super_offset(c)    (((find_first_bit(&cap_super_page_val(c), 4)) \
+                    * OFFSET_STRIDE) + 21)
+
+#define cap_fault_reg_offset(c)    ((((c) >> 24) & 0x3ff) * 16)
+
+#define cap_isoch(c)        (((c) >> 23) & 1)
+#define cap_qos(c)        (((c) >> 22) & 1)
+#define cap_mgaw(c)        ((((c) >> 16) & 0x3f) + 1)
+#define cap_sagaw(c)        (((c) >> 8) & 0x1f)
+#define cap_caching_mode(c)    (((c) >> 7) & 1)
+#define cap_phmr(c)        (((c) >> 6) & 1)
+#define cap_plmr(c)        (((c) >> 5) & 1)
+#define cap_rwbf(c)        (((c) >> 4) & 1)
+#define cap_afl(c)        (((c) >> 3) & 1)
+#define cap_ndoms(c)        (1 << (4 + 2 * ((c) & 0x7)))
+
+/*
+ * Extended Capability Register
+ */
+
+#define ecap_niotlb_iunits(e)    ((((e) >> 24) & 0xff) + 1)
+#define ecap_iotlb_offset(e)     ((((e) >> 8) & 0x3ff) * 16)
+#define ecap_coherent(e)         ((e >> 0) & 0x1)
+#define ecap_queued_inval(e)     ((e >> 1) & 0x1)
+#define ecap_dev_iotlb(e)        ((e >> 2) & 0x1)
+#define ecap_intr_remap(e)       ((e >> 3) & 0x1)
+#define ecap_ext_intr(e)         ((e >> 4) & 0x1)
+#define ecap_cache_hints(e)      ((e >> 5) & 0x1)
+#define ecap_pass_thru(e)        ((e >> 6) & 0x1)
+#define ecap_snp_ctl(e)          ((e >> 7) & 0x1)
+
+/* IOTLB_REG */
+#define DMA_TLB_FLUSH_GRANU_OFFSET  60
+#define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60)
+#define DMA_TLB_DSI_FLUSH (((u64)2) << 60)
+#define DMA_TLB_PSI_FLUSH (((u64)3) << 60)
+#define DMA_TLB_IIRG(x) (((x) >> 60) & 7)
+#define DMA_TLB_IAIG(val) (((val) >> 57) & 7)
+#define DMA_TLB_DID(x) (((u64)(x & 0xffff)) << 32)
+
+#define DMA_TLB_READ_DRAIN (((u64)1) << 49)
+#define DMA_TLB_WRITE_DRAIN (((u64)1) << 48)
+#define DMA_TLB_IVT (((u64)1) << 63)
+
+#define DMA_TLB_IVA_ADDR(x) ((((u64)x) >> 12) << 12)
+#define DMA_TLB_IVA_HINT(x) ((((u64)x) & 1) << 6)
+
+/* GCMD_REG */
+#define DMA_GCMD_TE     (((u64)1) << 31)
+#define DMA_GCMD_SRTP   (((u64)1) << 30)
+#define DMA_GCMD_SFL    (((u64)1) << 29)
+#define DMA_GCMD_EAFL   (((u64)1) << 28)
+#define DMA_GCMD_WBF    (((u64)1) << 27)
+#define DMA_GCMD_QIE    (((u64)1) << 26)
+#define DMA_GCMD_IRE    (((u64)1) << 25)
+#define DMA_GCMD_SIRTP  (((u64)1) << 24)
+#define DMA_GCMD_CFI    (((u64)1) << 23)
+
+/* GSTS_REG */
+#define DMA_GSTS_TES    (((u64)1) << 31)
+#define DMA_GSTS_RTPS   (((u64)1) << 30)
+#define DMA_GSTS_FLS    (((u64)1) << 29)
+#define DMA_GSTS_AFLS   (((u64)1) << 28)
+#define DMA_GSTS_WBFS   (((u64)1) << 27)
+#define DMA_GSTS_QIES   (((u64)1) <<26)
+#define DMA_GSTS_IRES   (((u64)1) <<25)
+#define DMA_GSTS_SIRTPS (((u64)1) << 24)
+#define DMA_GSTS_CFIS   (((u64)1) <<23)
+
+/* PMEN_REG */
+#define DMA_PMEN_EPM    (((u32)1) << 31)
+#define DMA_PMEN_PRS    (((u32)1) << 0)
+
+/* CCMD_REG */
+#define DMA_CCMD_INVL_GRANU_OFFSET  61
+#define DMA_CCMD_ICC   (((u64)1) << 63)
+#define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61)
+#define DMA_CCMD_DOMAIN_INVL (((u64)2) << 61)
+#define DMA_CCMD_DEVICE_INVL (((u64)3) << 61)
+#define DMA_CCMD_FM(m) (((u64)((m) & 0x3)) << 32)
+#define DMA_CCMD_CIRG(x) ((((u64)3) << 61) & x)
+#define DMA_CCMD_MASK_NOBIT 0
+#define DMA_CCMD_MASK_1BIT 1
+#define DMA_CCMD_MASK_2BIT 2
+#define DMA_CCMD_MASK_3BIT 3
+#define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16)
+#define DMA_CCMD_DID(d) ((u64)((d) & 0xffff))
+
+#define DMA_CCMD_CAIG_MASK(x) (((u64)x) & ((u64) 0x3 << 59))
+
+/* FECTL_REG */
+#define DMA_FECTL_IM (((u64)1) << 31)
+
+/* FSTS_REG */
+#define DMA_FSTS_PFO ((u64)1 << 0)
+#define DMA_FSTS_PPF ((u64)1 << 1)
+#define DMA_FSTS_AFO ((u64)1 << 2)
+#define DMA_FSTS_APF ((u64)1 << 3)
+#define DMA_FSTS_IQE ((u64)1 << 4)
+#define DMA_FSTS_ICE ((u64)1 << 5)
+#define DMA_FSTS_ITE ((u64)1 << 6)
+#define DMA_FSTS_FAULTS    (DMA_FSTS_PFO | DMA_FSTS_PPF | DMA_FSTS_AFO | \
+                            DMA_FSTS_APF | DMA_FSTS_IQE | DMA_FSTS_ICE | \
+                            DMA_FSTS_ITE)
+#define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)
+
+/* FRCD_REG, 32 bits access */
+#define DMA_FRCD_F (((u64)1) << 31)
+#define dma_frcd_type(d) ((d >> 30) & 1)
+#define dma_frcd_fault_reason(c) (c & 0xff)
+#define dma_frcd_source_id(c) (c & 0xffff)
+#define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bit */
+
+/*
+ * 0: Present
+ * 1-11: Reserved
+ * 12-63: Context Ptr (12 - (haw-1))
+ * 64-127: Reserved
+ */
+struct root_entry {
+    u64    val;
+    u64    rsvd1;
+};
+#define root_present(root)    ((root).val & 1)
+#define set_root_present(root) do {(root).val |= 1;} while(0)
+#define get_context_addr(root) ((root).val & PAGE_MASK_4K)
+#define set_root_value(root, value) \
+    do {(root).val |= ((value) & PAGE_MASK_4K);} while(0)
+
+struct context_entry {
+    u64 lo;
+    u64 hi;
+};
+#define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry))
+#define context_present(c) ((c).lo & 1)
+#define context_fault_disable(c) (((c).lo >> 1) & 1)
+#define context_translation_type(c) (((c).lo >> 2) & 3)
+#define context_address_root(c) ((c).lo & PAGE_MASK_4K)
+#define context_address_width(c) ((c).hi &  7)
+#define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1))
+
+#define context_set_present(c) do {(c).lo |= 1;} while(0)
+#define context_clear_present(c) do {(c).lo &= ~1;} while(0)
+#define context_set_fault_enable(c) \
+    do {(c).lo &= (((u64)-1) << 2) | 1;} while(0)
+
+#define context_set_translation_type(c, val) do { \
+        (c).lo &= (((u64)-1) << 4) | 3; \
+        (c).lo |= (val & 3) << 2; \
+    } while(0)
+#define CONTEXT_TT_MULTI_LEVEL 0
+#define CONTEXT_TT_DEV_IOTLB   1
+#define CONTEXT_TT_PASS_THRU   2
+
+#define context_set_address_root(c, val) \
+    do {(c).lo &= 0xfff; (c).lo |= (val) & PAGE_MASK_4K ;} while(0)
+#define context_set_address_width(c, val) \
+    do {(c).hi &= 0xfffffff8; (c).hi |= (val) & 7;} while(0)
+#define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while(0)
+
+/* page table handling */
+#define LEVEL_STRIDE       (9)
+#define LEVEL_MASK         ((1 << LEVEL_STRIDE) - 1)
+#define PTE_NUM            (1 << LEVEL_STRIDE)
+#define level_to_agaw(val) ((val) - 2)
+#define agaw_to_level(val) ((val) + 2)
+#define agaw_to_width(val) (30 + val * LEVEL_STRIDE)
+#define width_to_agaw(w)   ((w - 30)/LEVEL_STRIDE)
+#define level_to_offset_bits(l) (12 + (l - 1) * LEVEL_STRIDE)
+#define address_level_offset(addr, level) \
+            ((addr >> level_to_offset_bits(level)) & LEVEL_MASK)
+#define level_mask(l) (((u64)(-1)) << level_to_offset_bits(l))
+#define level_size(l) (1 << level_to_offset_bits(l))
+#define align_to_level(addr, l) ((addr + level_size(l) - 1) & level_mask(l))
+
+/*
+ * 0: readable
+ * 1: writable
+ * 2-6: reserved
+ * 7: super page
+ * 8-11: available
+ * 12-63: Host physical address
+ */
+struct dma_pte {
+    u64 val;
+};
+#define DMA_PTE_READ (1)
+#define DMA_PTE_WRITE (2)
+#define DMA_PTE_SNP  (1 << 11)
+
+#define dma_clear_pte(p)    do {(p).val = 0;} while(0)
+#define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while(0)
+#define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while(0)
+#define dma_set_pte_superpage(p) do {(p).val |= (1 << 7);} while(0)
+#define dma_set_pte_snp(p)  do {(p).val |= DMA_PTE_SNP;} while(0)
+
+#define dma_set_pte_prot(p, prot) \
+            do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0)
+#define dma_pte_addr(p) ((p).val & PAGE_MASK_4K)
+#define dma_set_pte_addr(p, addr) do {\
+            (p).val |= ((addr) & PAGE_MASK_4K); } while (0)
+#define dma_pte_present(p) (((p).val & 3) != 0)
+
+/* interrupt remap entry */
+struct iremap_entry {
+  union {
+    u64 lo_val;
+    struct {
+        u64 p       : 1,
+            fpd     : 1,
+            dm      : 1,
+            rh      : 1,
+            tm      : 1,
+            dlm     : 3,
+            avail   : 4,
+            res_1   : 4,
+            vector  : 8,
+            res_2   : 8,
+            dst     : 32;
+    }lo;
+  };
+  union {
+    u64 hi_val;
+    struct {
+        u64 sid     : 16,
+            sq      : 2,
+            svt     : 2,
+            res_1   : 44;
+    }hi;
+  };
+};
+
+/* Max intr remapping table page order is 8, as max number of IRTEs is 64K */
+#define IREMAP_PAGE_ORDER  8
+
+/*
+ * The VT-d engine handles 4K pages, while the CPU may use a different page
+ * size on other architectures, e.g. 16K on IPF.
+ */
+#define IREMAP_ARCH_PAGE_ORDER  (IREMAP_PAGE_ORDER + PAGE_SHIFT_4K - PAGE_SHIFT)
+#define IREMAP_ARCH_PAGE_NR     ( IREMAP_ARCH_PAGE_ORDER < 0 ?  \
+                                1 :                             \
+                                1 << IREMAP_ARCH_PAGE_ORDER )
+
+/* Each entry is 16 bytes, so 2^8 entries per 4K page */
+#define IREMAP_ENTRY_ORDER  ( PAGE_SHIFT - 4 )
+#define IREMAP_ENTRY_NR     ( 1 << ( IREMAP_PAGE_ORDER + 8 ) )
+
+#define iremap_present(v) ((v).lo & 1)
+#define iremap_fault_disable(v) (((v).lo >> 1) & 1)
+
+#define iremap_set_present(v) do {(v).lo |= 1;} while(0)
+#define iremap_clear_present(v) do {(v).lo &= ~1;} while(0)
+
+/*
+ * Get the interrupt remap entry:
+ * maddr   - machine addr of the table
+ * index   - index of the entry
+ * entries - returns the addr of the page holding this entry; the caller
+ *           must unmap it
+ * entry   - returns the required entry
+ */
+#define GET_IREMAP_ENTRY(maddr, index, entries, entry)                        \
+do {                                                                          \
+    entries = (struct iremap_entry *)map_vtd_domain_page(                     \
+              (maddr) + (( (index) >> IREMAP_ENTRY_ORDER ) << PAGE_SHIFT ) ); \
+    entry = &entries[(index) % (1 << IREMAP_ENTRY_ORDER)];                    \
+} while(0)
+
+/* queue invalidation entry */
+struct qinval_entry {
+    union {
+        struct {
+            u64 lo;
+            u64 hi;
+        }val;
+        struct {
+            struct {
+                u64 type    : 4,
+                    granu   : 2,
+                    res_1   : 10,
+                    did     : 16,
+                    sid     : 16,
+                    fm      : 2,
+                    res_2   : 14;
+            }lo;
+            struct {
+                u64 res;
+            }hi;
+        }cc_inv_dsc;
+        struct {
+            struct {
+                u64 type    : 4,
+                    granu   : 2,
+                    dw      : 1,
+                    dr      : 1,
+                    res_1   : 8,
+                    did     : 16,
+                    res_2   : 32;
+            }lo;
+            struct {
+                u64 am      : 6,
+                    ih      : 1,
+                    res_1   : 5,
+                    addr    : 52;
+            }hi;
+        }iotlb_inv_dsc;
+        struct {
+            struct {
+                u64 type    : 4,
+                    res_1   : 12,
+                    max_invs_pend: 5,
+                    res_2   : 11,
+                    sid     : 16,
+                    res_3   : 16;
+            }lo;
+            struct {
+                u64 size    : 1,
+                    res_1   : 11,
+                    addr    : 52;
+            }hi;
+        }dev_iotlb_inv_dsc;
+        struct {
+            struct {
+                u64 type    : 4,
+                    granu   : 1,
+                    res_1   : 22,
+                    im      : 5,
+                    iidx    : 16,
+                    res_2   : 16;
+            }lo;
+            struct {
+                u64 res;
+            }hi;
+        }iec_inv_dsc;
+        struct {
+            struct {
+                u64 type    : 4,
+                    iflag   : 1,
+                    sw      : 1,
+                    fn      : 1,
+                    res_1   : 25,
+                    sdata   : 32;
+            }lo;
+            struct {
+                u64 res_1   : 2,
+                    saddr   : 62;
+            }hi;
+        }inv_wait_dsc;
+    }q;
+};
+
+struct poll_info {
+    u64 saddr;
+    u32 udata;
+};
+
+#define QINVAL_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct qinval_entry))
+#define qinval_present(v) ((v).lo & 1)
+#define qinval_fault_disable(v) (((v).lo >> 1) & 1)
+
+#define qinval_set_present(v) do {(v).lo |= 1;} while(0)
+#define qinval_clear_present(v) do {(v).lo &= ~1;} while(0)
+
+#define RESERVED_VAL        0
+
+#define TYPE_INVAL_CONTEXT      0x1
+#define TYPE_INVAL_IOTLB        0x2
+#define TYPE_INVAL_DEVICE_IOTLB 0x3
+#define TYPE_INVAL_IEC          0x4
+#define TYPE_INVAL_WAIT         0x5
+
+#define NOTIFY_TYPE_POLL        1
+#define NOTIFY_TYPE_INTR        1
+#define INTERRUTP_FLAG          1
+#define STATUS_WRITE            1
+#define FENCE_FLAG              1
+
+#define IEC_GLOBAL_INVL         0
+#define IEC_INDEX_INVL          1
+#define IRTA_REG_EIME_SHIFT     11
+#define IRTA_REG_TABLE_SIZE     7    // 4k page = 256 * 16-byte entries
+                                     // 2^(IRTA_REG_TABLE_SIZE + 1) = 256,
+                                     // hence IRTA_REG_TABLE_SIZE = 7
+
+#define VTD_PAGE_TABLE_LEVEL_3  3
+#define VTD_PAGE_TABLE_LEVEL_4  4
+
+#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
+#define MAX_IOMMU_REGS 0xc0
+
+extern struct list_head acpi_drhd_units;
+extern struct list_head acpi_rmrr_units;
+extern struct list_head acpi_ioapic_units;
+
+struct qi_ctrl {
+    u64 qinval_maddr;  /* queue invalidation page machine address */
+    int qinval_index;                    /* queue invalidation index */
+    spinlock_t qinval_lock;      /* lock for queue invalidation page */
+    spinlock_t qinval_poll_lock; /* lock for queue invalidation poll addr */
+    volatile u32 qinval_poll_status;     /* used by poll method to sync */
+};
+
+struct ir_ctrl {
+    u64 iremap_maddr;            /* interrupt remap table machine address */
+    int iremap_num;              /* total num of used interrupt remap entry */
+    spinlock_t iremap_lock;      /* lock for irq remapping table */
+};
+
+struct iommu_flush {
+    int (*context)(void *iommu, u16 did, u16 source_id,
+                   u8 function_mask, u64 type, int non_present_entry_flush);
+    int (*iotlb)(void *iommu, u16 did, u64 addr, unsigned int size_order,
+                 u64 type, int non_present_entry_flush);
+};
+
+struct intel_iommu {
+    struct qi_ctrl qi_ctrl;
+    struct ir_ctrl ir_ctrl;
+    struct iommu_flush flush;
+};
+
+#endif
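/*
 * Aside (illustrative sketch, not part of the patch above): the capability
 * macros in this header are plain fixed-width bit-field extractions.  A
 * minimal standalone sketch decoding a made-up DMAR_CAP_REG value with
 * local copies of three of the macros defined above:
 */
#include <stdio.h>
#include <stdint.h>

#define cap_fault_reg_offset(c) ((((c) >> 24) & 0x3ff) * 16)
#define cap_sagaw(c)            (((c) >> 8) & 0x1f)
#define cap_ndoms(c)            (1 << (4 + 2 * ((c) & 0x7)))

int main(void)
{
    uint64_t cap = 0xc0000462ULL;   /* hypothetical register value */

    /* ND field = 2 -> 1 << (4 + 2*2) = 256 supported domain ids. */
    printf("domains supported   : %d\n", cap_ndoms(cap));
    printf("supported AGAW mask : 0x%llx\n",
           (unsigned long long)cap_sagaw(cap));
    printf("fault reg offset    : 0x%llx\n",
           (unsigned long long)cap_fault_reg_offset(cap));
    return 0;
}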
diff -Naurp xen/drivers/passthrough/vtd/Makefile xen-redhat/drivers/passthrough/vtd/Makefile
--- xen/drivers/passthrough/vtd/Makefile
+++ xen-redhat/drivers/passthrough/vtd/Makefile
@@ -0,0 +1,8 @@
+subdir-$(x86_32) += x86
+subdir-$(x86_64) += x86
+
+obj-y += iommu.o
+obj-y += dmar.o
+obj-y += utils.o
+obj-y += qinval.o
+obj-y += intremap.o
diff -Naurp xen/drivers/passthrough/vtd/qinval.c xen-redhat/drivers/passthrough/vtd/qinval.c
--- xen/drivers/passthrough/vtd/qinval.c
+++ xen-redhat/drivers/passthrough/vtd/qinval.c
@@ -0,0 +1,463 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Allen Kay <allen.m.kay@intel.com>
+ * Copyright (C) Xiaohui Xin <xiaohui.xin@intel.com>
+ */
+
+
+#include <xen/sched.h>
+#include <xen/iommu.h>
+#include <xen/time.h>
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
+#include "iommu.h"
+#include "dmar.h"
+#include "vtd.h"
+#include "extern.h"
+
+static void print_qi_regs(struct iommu *iommu)
+{
+    u64 val;
+
+    val = dmar_readq(iommu->reg, DMAR_IQA_REG);
+    printk("DMAR_IQA_REG = %"PRIx64"\n", val);
+
+    val = dmar_readq(iommu->reg, DMAR_IQH_REG);
+    printk("DMAR_IQH_REG = %"PRIx64"\n", val);
+
+    val = dmar_readq(iommu->reg, DMAR_IQT_REG);
+    printk("DMAR_IQT_REG = %"PRIx64"\n", val);
+}
+
+static int qinval_next_index(struct iommu *iommu)
+{
+    u64 val;
+    val = dmar_readq(iommu->reg, DMAR_IQT_REG);
+    return (val >> 4);
+}
+
+static int qinval_update_qtail(struct iommu *iommu, int index)
+{
+    u64 val;
+
+    /* Need an ASSERT to ensure that we hold the register lock */
+    val = (index < (QINVAL_ENTRY_NR-1)) ? (index + 1) : 0;
+    dmar_writeq(iommu->reg, DMAR_IQT_REG, (val << 4));
+    return 0;
+}
+
+static int gen_cc_inv_dsc(struct iommu *iommu, int index,
+    u16 did, u16 source_id, u8 function_mask, u8 granu)
+{
+    unsigned long flags;
+    struct qinval_entry *qinval_entry = NULL, *qinval_entries;
+    struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu);
+
+    spin_lock_irqsave(&qi_ctrl->qinval_lock, flags);
+    qinval_entries =
+        (struct qinval_entry *)map_vtd_domain_page(qi_ctrl->qinval_maddr);
+    qinval_entry = &qinval_entries[index];
+    qinval_entry->q.cc_inv_dsc.lo.type = TYPE_INVAL_CONTEXT;
+    qinval_entry->q.cc_inv_dsc.lo.granu = granu;
+    qinval_entry->q.cc_inv_dsc.lo.res_1 = 0;
+    qinval_entry->q.cc_inv_dsc.lo.did = did;
+    qinval_entry->q.cc_inv_dsc.lo.sid = source_id;
+    qinval_entry->q.cc_inv_dsc.lo.fm = function_mask;
+    qinval_entry->q.cc_inv_dsc.lo.res_2 = 0;
+    qinval_entry->q.cc_inv_dsc.hi.res = 0;
+
+    unmap_vtd_domain_page(qinval_entries);
+    spin_unlock_irqrestore(&qi_ctrl->qinval_lock, flags);
+
+    return 0;
+}
+
+int queue_invalidate_context(struct iommu *iommu,
+    u16 did, u16 source_id, u8 function_mask, u8 granu)
+{
+    int ret = -1;
+    unsigned long flags;
+    int index = -1;
+
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    index = qinval_next_index(iommu);
+    if ( index == -1 )
+    {
+        spin_unlock_irqrestore(&iommu->register_lock, flags);
+        return -EBUSY;
+    }
+    ret = gen_cc_inv_dsc(iommu, index, did, source_id,
+                         function_mask, granu);
+    ret |= qinval_update_qtail(iommu, index);
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+    return ret;
+}
+
+static int gen_iotlb_inv_dsc(struct iommu *iommu, int index,
+    u8 granu, u8 dr, u8 dw, u16 did, u8 am, u8 ih, u64 addr)
+{
+    unsigned long flags;
+    struct qinval_entry *qinval_entry = NULL, *qinval_entries;
+    struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu);
+
+    if ( index == -1 )
+        return -1;
+    spin_lock_irqsave(&qi_ctrl->qinval_lock, flags);
+
+    qinval_entries =
+        (struct qinval_entry *)map_vtd_domain_page(qi_ctrl->qinval_maddr);
+    qinval_entry = &qinval_entries[index];
+    qinval_entry->q.iotlb_inv_dsc.lo.type = TYPE_INVAL_IOTLB;
+    qinval_entry->q.iotlb_inv_dsc.lo.granu = granu;
+    qinval_entry->q.iotlb_inv_dsc.lo.dr = 0;
+    qinval_entry->q.iotlb_inv_dsc.lo.dw = 0;
+    qinval_entry->q.iotlb_inv_dsc.lo.res_1 = 0;
+    qinval_entry->q.iotlb_inv_dsc.lo.did = did;
+    qinval_entry->q.iotlb_inv_dsc.lo.res_2 = 0;
+
+    qinval_entry->q.iotlb_inv_dsc.hi.am = am;
+    qinval_entry->q.iotlb_inv_dsc.hi.ih = ih;
+    qinval_entry->q.iotlb_inv_dsc.hi.res_1 = 0;
+    qinval_entry->q.iotlb_inv_dsc.hi.addr = addr;
+
+    unmap_vtd_domain_page(qinval_entries);
+    spin_unlock_irqrestore(&qi_ctrl->qinval_lock, flags);
+    return 0;
+}
+
+int queue_invalidate_iotlb(struct iommu *iommu,
+    u8 granu, u8 dr, u8 dw, u16 did, u8 am, u8 ih, u64 addr)
+{
+    int ret = -1;
+    unsigned long flags;
+    int index = -1;
+
+    spin_lock_irqsave(&iommu->register_lock, flags);
+
+    index = qinval_next_index(iommu);
+    ret = gen_iotlb_inv_dsc(iommu, index, granu, dr, dw, did,
+                            am, ih, addr);
+    ret |= qinval_update_qtail(iommu, index);
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+    return ret;
+}
+
+static int gen_wait_dsc(struct iommu *iommu, int index,
+    u8 iflag, u8 sw, u8 fn, u32 sdata, volatile u32 *saddr)
+{
+    unsigned long flags;
+    struct qinval_entry *qinval_entry = NULL, *qinval_entries;
+    struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu);
+
+    if ( index == -1 )
+        return -1;
+    spin_lock_irqsave(&qi_ctrl->qinval_lock, flags);
+    qinval_entries =
+        (struct qinval_entry *)map_vtd_domain_page(qi_ctrl->qinval_maddr);
+    qinval_entry = &qinval_entries[index];
+    qinval_entry->q.inv_wait_dsc.lo.type = TYPE_INVAL_WAIT;
+    qinval_entry->q.inv_wait_dsc.lo.iflag = iflag;
+    qinval_entry->q.inv_wait_dsc.lo.sw = sw;
+    qinval_entry->q.inv_wait_dsc.lo.fn = fn;
+    qinval_entry->q.inv_wait_dsc.lo.res_1 = 0;
+    qinval_entry->q.inv_wait_dsc.lo.sdata = sdata;
+    qinval_entry->q.inv_wait_dsc.hi.res_1 = 0;
+    qinval_entry->q.inv_wait_dsc.hi.saddr = virt_to_maddr(saddr) >> 2;
+    unmap_vtd_domain_page(qinval_entries);
+    spin_unlock_irqrestore(&qi_ctrl->qinval_lock, flags);
+    return 0;
+}
+
+static int queue_invalidate_wait(struct iommu *iommu,
+    u8 iflag, u8 sw, u8 fn, u32 sdata, volatile u32 *saddr)
+{
+    unsigned long flags;
+    s_time_t start_time;
+    int index = -1;
+    int ret = -1;
+    struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu);
+
+    spin_lock_irqsave(&qi_ctrl->qinval_poll_lock, flags);
+    spin_lock(&iommu->register_lock);
+    index = qinval_next_index(iommu);
+    if ( *saddr == 1 )
+        *saddr = 0;
+    ret = gen_wait_dsc(iommu, index, iflag, sw, fn, sdata, saddr);
+    ret |= qinval_update_qtail(iommu, index);
+    spin_unlock(&iommu->register_lock);
+
+    /* The interrupt notification method is not supported yet */
+    if ( sw )
+    {
+        /* In case all wait descriptors write to the same addr and data */
+        start_time = NOW();
+        while ( *saddr != 1 )
+        {
+            if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) )
+            {
+                print_qi_regs(iommu);
+                panic("queue invalidate wait descriptor was not executed\n");
+            }
+            cpu_relax();
+        }
+    }
+    spin_unlock_irqrestore(&qi_ctrl->qinval_poll_lock, flags);
+    return ret;
+}
+
+int invalidate_sync(struct iommu *iommu)
+{
+    int ret = -1;
+    struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu);
+
+    if ( qi_ctrl->qinval_maddr != 0 )
+    {
+        ret = queue_invalidate_wait(iommu,
+            0, 1, 1, 1, &qi_ctrl->qinval_poll_status);
+        return ret;
+    }
+    return 0;
+}
+
+static int gen_dev_iotlb_inv_dsc(struct iommu *iommu, int index,
+    u32 max_invs_pend, u16 sid, u16 size, u64 addr)
+{
+    unsigned long flags;
+    struct qinval_entry *qinval_entry = NULL, *qinval_entries;
+    struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu);
+
+    if ( index == -1 )
+        return -1;
+    spin_lock_irqsave(&qi_ctrl->qinval_lock, flags);
+
+    qinval_entries =
+        (struct qinval_entry *)map_vtd_domain_page(qi_ctrl->qinval_maddr);
+    qinval_entry = &qinval_entries[index];
+    qinval_entry->q.dev_iotlb_inv_dsc.lo.type = TYPE_INVAL_DEVICE_IOTLB;
+    qinval_entry->q.dev_iotlb_inv_dsc.lo.res_1 = 0;
+    qinval_entry->q.dev_iotlb_inv_dsc.lo.max_invs_pend = max_invs_pend;
+    qinval_entry->q.dev_iotlb_inv_dsc.lo.res_2 = 0;
+    qinval_entry->q.dev_iotlb_inv_dsc.lo.sid = sid;
+    qinval_entry->q.dev_iotlb_inv_dsc.lo.res_3 = 0;
+
+    qinval_entry->q.dev_iotlb_inv_dsc.hi.size = size;
+    qinval_entry->q.dev_iotlb_inv_dsc.hi.res_1 = 0;
+    qinval_entry->q.dev_iotlb_inv_dsc.hi.addr = addr >> PAGE_SHIFT_4K;
+
+    unmap_vtd_domain_page(qinval_entries);
+    spin_unlock_irqrestore(&qi_ctrl->qinval_lock, flags);
+    return 0;
+}
+
+int qinval_device_iotlb(struct iommu *iommu,
+    u32 max_invs_pend, u16 sid, u16 size, u64 addr)
+{
+    int ret = -1;
+    unsigned long flags;
+    int index = -1;
+
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    index = qinval_next_index(iommu);
+    ret = gen_dev_iotlb_inv_dsc(iommu, index, max_invs_pend,
+                                sid, size, addr);
+    ret |= qinval_update_qtail(iommu, index);
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+    return ret;
+}
+
+static int gen_iec_inv_dsc(struct iommu *iommu, int index,
+    u8 granu, u8 im, u16 iidx)
+{
+    unsigned long flags;
+    struct qinval_entry *qinval_entry = NULL, *qinval_entries;
+    struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu);
+
+    if ( index == -1 )
+        return -1;
+    spin_lock_irqsave(&qi_ctrl->qinval_lock, flags);
+
+    qinval_entries =
+        (struct qinval_entry *)map_vtd_domain_page(qi_ctrl->qinval_maddr);
+    qinval_entry = &qinval_entries[index];
+    qinval_entry->q.iec_inv_dsc.lo.type = TYPE_INVAL_IEC;
+    qinval_entry->q.iec_inv_dsc.lo.granu = granu;
+    qinval_entry->q.iec_inv_dsc.lo.res_1 = 0;
+    qinval_entry->q.iec_inv_dsc.lo.im = im;
+    qinval_entry->q.iec_inv_dsc.lo.iidx = iidx;
+    qinval_entry->q.iec_inv_dsc.lo.res_2 = 0;
+    qinval_entry->q.iec_inv_dsc.hi.res = 0;
+
+    unmap_vtd_domain_page(qinval_entries);
+    spin_unlock_irqrestore(&qi_ctrl->qinval_lock, flags);
+    return 0;
+}
+
+int queue_invalidate_iec(struct iommu *iommu, u8 granu, u8 im, u16 iidx)
+{
+    int ret;
+    unsigned long flags;
+    int index = -1;
+
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    index = qinval_next_index(iommu);
+    ret = gen_iec_inv_dsc(iommu, index, granu, im, iidx);
+    ret |= qinval_update_qtail(iommu, index);
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+    return ret;
+}
+
+int __iommu_flush_iec(struct iommu *iommu, u8 granu, u8 im, u16 iidx)
+{
+    int ret;
+    ret = queue_invalidate_iec(iommu, granu, im, iidx);
+    ret |= invalidate_sync(iommu);
+
+    /*
+     * Reading a VT-d architectural register ensures draining happens in
+     * an implementation-independent way.
+     */
+    (void)dmar_readq(iommu->reg, DMAR_CAP_REG);
+    return ret;
+}
+
+int iommu_flush_iec_global(struct iommu *iommu)
+{
+    return __iommu_flush_iec(iommu, IEC_GLOBAL_INVL, 0, 0);
+}
+
+int iommu_flush_iec_index(struct iommu *iommu, u8 im, u16 iidx)
+{
+   return __iommu_flush_iec(iommu, IEC_INDEX_INVL, im, iidx);
+}
+
+static int flush_context_qi(
+    void *_iommu, u16 did, u16 sid, u8 fm, u64 type,
+    int non_present_entry_flush)
+{
+    int ret = 0;
+    struct iommu *iommu = (struct iommu *)_iommu;
+    struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu);
+
+    /*
+     * In the non-present entry flush case, if the hardware doesn't cache
+     * non-present entries we do nothing; if it does, we flush the entries
+     * of domain 0 (that domain id is used to cache any non-present entries).
+     */
+    if ( non_present_entry_flush )
+    {
+        if ( !cap_caching_mode(iommu->cap) )
+            return 1;
+        else
+            did = 0;
+    }
+
+    if ( qi_ctrl->qinval_maddr != 0 )
+    {
+        ret = queue_invalidate_context(iommu, did, sid, fm,
+                                       type >> DMA_CCMD_INVL_GRANU_OFFSET);
+        ret |= invalidate_sync(iommu);
+    }
+    return ret;
+}
+
+static int flush_iotlb_qi(
+    void *_iommu, u16 did,
+    u64 addr, unsigned int size_order, u64 type,
+    int non_present_entry_flush)
+{
+    u8 dr = 0, dw = 0;
+    int ret = 0;
+    struct iommu *iommu = (struct iommu *)_iommu;
+    struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu);
+
+    /*
+     * In the non-present entry flush case, if the hardware doesn't cache
+     * non-present entries we do nothing; if it does, we flush the entries
+     * of domain 0 (that domain id is used to cache any non-present entries).
+     */
+    if ( non_present_entry_flush )
+    {
+        if ( !cap_caching_mode(iommu->cap) )
+            return 1;
+        else
+            did = 0;
+    }
+
+    if ( qi_ctrl->qinval_maddr != 0 )
+    {
+        /* use queued invalidation */
+        if (cap_write_drain(iommu->cap))
+            dw = 1;
+        if (cap_read_drain(iommu->cap))
+            dr = 1;
+        /* Need to consider the ih bit later */
+        ret = queue_invalidate_iotlb(iommu,
+                  (type >> DMA_TLB_FLUSH_GRANU_OFFSET), dr,
+                  dw, did, (u8)size_order, 0, addr);
+        ret |= invalidate_sync(iommu);
+    }
+    return ret;
+}
+
+int qinval_setup(struct iommu *iommu)
+{
+    s_time_t start_time;
+    struct qi_ctrl *qi_ctrl;
+    struct iommu_flush *flush;
+
+    qi_ctrl = iommu_qi_ctrl(iommu);
+    flush = iommu_get_flush(iommu);
+
+    if ( !ecap_queued_inval(iommu->ecap) )
+        return -ENODEV;
+
+    if ( qi_ctrl->qinval_maddr == 0 )
+    {
+        qi_ctrl->qinval_maddr = alloc_pgtable_maddr(NULL);
+        if ( qi_ctrl->qinval_maddr == 0 )
+        {
+            dprintk(XENLOG_WARNING VTDPREFIX,
+                    "Cannot allocate memory for qi_ctrl->qinval_maddr\n");
+            return -ENOMEM;
+        }
+        flush->context = flush_context_qi;
+        flush->iotlb = flush_iotlb_qi;
+    }
+
+    /* Set up the Invalidation Queue Address (IQA) register with the
+     * address of the page we just allocated.  The QS field at bits [2:0]
+     * indicates the queue size; zero means one 4KB page, i.e. 256 entries.
+     * The Queue Head (IQH) and Queue Tail (IQT) registers are
+     * automatically reset to 0 by a write to the IQA register.
+     */
+    dmar_writeq(iommu->reg, DMAR_IQA_REG, qi_ctrl->qinval_maddr);
+
+    /* enable queued invalidation hardware */
+    iommu->gcmd |= DMA_GCMD_QIE;
+    dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
+
+    /* Make sure hardware complete it */
+    start_time = NOW();
+    while ( !(dmar_readl(iommu->reg, DMAR_GSTS_REG) & DMA_GSTS_QIES) )
+    {
+        if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) )
+            panic("Cannot set QIE field for queue invalidation\n");
+        cpu_relax();
+    }
+
+    return 0;
+}
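/*
 * Aside (illustrative sketch, not part of the patch above): with a single
 * 4KB queue page and 16-byte descriptors, QINVAL_ENTRY_NR is 256, and
 * qinval_update_qtail() advances the tail index modulo that count before
 * shifting it into bits [18:4] of DMAR_IQT_REG.  A tiny standalone sketch
 * of that wrap-around arithmetic:
 */
#include <stdio.h>

#define QINVAL_ENTRY_NR 256   /* PAGE_SIZE_4K / sizeof(struct qinval_entry) */

int main(void)
{
    int index;

    for ( index = 254; index < 257; index++ )
    {
        int cur  = index % QINVAL_ENTRY_NR;
        int next = (cur < (QINVAL_ENTRY_NR - 1)) ? (cur + 1) : 0;

        /* The register expects the index shifted into bits [18:4]. */
        printf("tail index %3d -> %3d, DMAR_IQT_REG value 0x%x\n",
               cur, next, next << 4);
    }
    return 0;
}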
diff -Naurp xen/drivers/passthrough/vtd/utils.c xen-redhat/drivers/passthrough/vtd/utils.c
--- xen/drivers/passthrough/vtd/utils.c
+++ xen-redhat/drivers/passthrough/vtd/utils.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Allen Kay <allen.m.kay@intel.com>
+ */
+
+#include <xen/sched.h>
+#include <xen/delay.h>
+#include <xen/iommu.h>
+#include <xen/time.h>
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
+#include "iommu.h"
+#include "dmar.h"
+#include "vtd.h"
+#include "extern.h"
+
+int is_usb_device(u8 bus, u8 devfn)
+{
+    u16 class = pci_conf_read16(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+                                PCI_CLASS_DEVICE);
+    return (class == 0xc03);
+}
+
+/* Disable vt-d protected memory registers. */
+void disable_pmr(struct iommu *iommu)
+{
+    s_time_t start_time;
+    unsigned int val;
+
+    val = dmar_readl(iommu->reg, DMAR_PMEN_REG);
+    if ( !(val & DMA_PMEN_PRS) )
+        return;
+
+    dmar_writel(iommu->reg, DMAR_PMEN_REG, val & ~DMA_PMEN_EPM);
+    start_time = NOW();
+
+    for ( ; ; )
+    {
+        val = dmar_readl(iommu->reg, DMAR_PMEN_REG);
+        if ( (val & DMA_PMEN_PRS) == 0 )
+            break;
+
+        if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
+            panic("Disable PMRs timeout\n");
+
+        cpu_relax();
+    }
+
+    dprintk(XENLOG_INFO VTDPREFIX,
+            "Disabled protected memory registers\n");
+}
+
+void print_iommu_regs(struct acpi_drhd_unit *drhd)
+{
+    struct iommu *iommu = drhd->iommu;
+
+    printk("---- print_iommu_regs ----\n");
+    printk("print_iommu_regs: drhd->address = %"PRIx64"\n", drhd->address);
+    printk("print_iommu_regs: DMAR_VER_REG = %x\n",
+           dmar_readl(iommu->reg,DMAR_VER_REG));
+    printk("print_iommu_regs: DMAR_CAP_REG = %"PRIx64"\n",
+           dmar_readq(iommu->reg,DMAR_CAP_REG));
+    printk("print_iommu_regs: n_fault_reg = %"PRIx64"\n",
+           cap_num_fault_regs(dmar_readq(iommu->reg, DMAR_CAP_REG)));
+    printk("print_iommu_regs: fault_recording_offset_l = %"PRIx64"\n",
+           cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)));
+    printk("print_iommu_regs: fault_recording_offset_h = %"PRIx64"\n",
+           cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)) + 8);
+    printk("print_iommu_regs: fault_recording_reg_l = %"PRIx64"\n",
+           dmar_readq(iommu->reg,
+               cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG))));
+    printk("print_iommu_regs: fault_recording_reg_h = %"PRIx64"\n",
+           dmar_readq(iommu->reg,
+               cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)) + 8));
+    printk("print_iommu_regs: DMAR_ECAP_REG = %"PRIx64"\n",
+           dmar_readq(iommu->reg,DMAR_ECAP_REG));
+    printk("print_iommu_regs: DMAR_GCMD_REG = %x\n",
+           dmar_readl(iommu->reg,DMAR_GCMD_REG));
+    printk("print_iommu_regs: DMAR_GSTS_REG = %x\n",
+           dmar_readl(iommu->reg,DMAR_GSTS_REG));
+    printk("print_iommu_regs: DMAR_RTADDR_REG = %"PRIx64"\n",
+           dmar_readq(iommu->reg,DMAR_RTADDR_REG));
+    printk("print_iommu_regs: DMAR_CCMD_REG = %"PRIx64"\n",
+           dmar_readq(iommu->reg,DMAR_CCMD_REG));
+    printk("print_iommu_regs: DMAR_FSTS_REG = %x\n",
+           dmar_readl(iommu->reg,DMAR_FSTS_REG));
+    printk("print_iommu_regs: DMAR_FECTL_REG = %x\n",
+           dmar_readl(iommu->reg,DMAR_FECTL_REG));
+    printk("print_iommu_regs: DMAR_FEDATA_REG = %x\n",
+           dmar_readl(iommu->reg,DMAR_FEDATA_REG));
+    printk("print_iommu_regs: DMAR_FEADDR_REG = %x\n",
+           dmar_readl(iommu->reg,DMAR_FEADDR_REG));
+    printk("print_iommu_regs: DMAR_FEUADDR_REG = %x\n",
+           dmar_readl(iommu->reg,DMAR_FEUADDR_REG));
+}
+
+u32 get_level_index(unsigned long gmfn, int level)
+{
+    while ( --level )
+        gmfn = gmfn >> LEVEL_STRIDE;
+
+    return gmfn & LEVEL_MASK;
+}
+
+void print_vtd_entries(struct iommu *iommu, int bus, int devfn, u64 gmfn)
+{
+    struct context_entry *ctxt_entry;
+    struct root_entry *root_entry;
+    struct dma_pte pte;
+    u64 *l;
+    u32 l_index, level;
+
+    printk("print_vtd_entries: iommu = %p bdf = %x:%x:%x gmfn = %"PRIx64"\n",
+           iommu, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), gmfn);
+
+    if ( iommu->root_maddr == 0 )
+    {
+        printk("    iommu->root_maddr = 0\n");
+        return;
+    }
+
+    root_entry = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
+ 
+    printk("    root_entry = %p\n", root_entry);
+    printk("    root_entry[%x] = %"PRIx64"\n", bus, root_entry[bus].val);
+    if ( !root_present(root_entry[bus]) )
+    {
+        unmap_vtd_domain_page(root_entry);
+        printk("    root_entry[%x] not present\n", bus);
+        return;
+    }
+
+    ctxt_entry =
+        (struct context_entry *)map_vtd_domain_page(root_entry[bus].val);
+    if ( ctxt_entry == NULL )
+    {
+        unmap_vtd_domain_page(root_entry);
+        printk("    ctxt_entry == NULL\n");
+        return;
+    }
+
+    printk("    context = %p\n", ctxt_entry);
+    printk("    context[%x] = %"PRIx64"_%"PRIx64"\n",
+           devfn, ctxt_entry[devfn].hi, ctxt_entry[devfn].lo);
+    if ( !context_present(ctxt_entry[devfn]) )
+    {
+        unmap_vtd_domain_page(ctxt_entry);
+        unmap_vtd_domain_page(root_entry);
+        printk("    ctxt_entry[%x] not present\n", devfn);
+        return;
+    }
+
+    level = agaw_to_level(context_address_width(ctxt_entry[devfn]));
+    if ( level != VTD_PAGE_TABLE_LEVEL_3 &&
+         level != VTD_PAGE_TABLE_LEVEL_4 )
+    {
+        unmap_vtd_domain_page(ctxt_entry);
+        unmap_vtd_domain_page(root_entry);
+        printk("Unsupported VTD page table level (%d)!\n", level);
+        return;
+    }
+
+    l = maddr_to_virt(ctxt_entry[devfn].lo);
+    do
+    {
+        l = (u64*)(((unsigned long)l >> PAGE_SHIFT_4K) << PAGE_SHIFT_4K);
+        printk("    l%d = %p\n", level, l);
+        if ( l == NULL )
+        {
+            unmap_vtd_domain_page(ctxt_entry);
+            unmap_vtd_domain_page(root_entry);
+            printk("    l%d == NULL\n", level);
+            break;
+        }
+        l_index = get_level_index(gmfn, level);
+        printk("    l%d_index = %x\n", level, l_index);
+        printk("    l%d[%x] = %"PRIx64"\n", level, l_index, l[l_index]);
+
+        pte.val = l[l_index];
+        if ( !dma_pte_present(pte) )
+        {
+            unmap_vtd_domain_page(ctxt_entry);
+            unmap_vtd_domain_page(root_entry);
+            printk("    l%d[%x] not present\n", level, l_index);
+            break;
+        }
+
+        l = maddr_to_virt(l[l_index]);
+    } while ( --level );
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -Naurp xen/drivers/passthrough/vtd/vtd.h xen-redhat/drivers/passthrough/vtd/vtd.h
--- xen/drivers/passthrough/vtd/vtd.h
+++ xen-redhat/drivers/passthrough/vtd/vtd.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Allen Kay <allen.m.kay@intel.com>
+ * Copyright (C) Weidong Han <weidong.han@intel.com>
+ */
+
+#ifndef _VTD_H_
+#define _VTD_H_
+
+#include <xen/iommu.h>
+
+/* Accommodate both IOAPIC and IOSAPIC. */
+struct IO_xAPIC_route_entry {
+    __u32   vector      :  8,
+        delivery_mode   :  3,   /* 000: FIXED
+                                 * 001: lowest prio
+                                 * 111: ExtINT
+                                 */
+        dest_mode       :  1,   /* 0: physical, 1: logical */
+        delivery_status :  1,
+        polarity        :  1,
+        irr             :  1,
+        trigger         :  1,   /* 0: edge, 1: level */
+        mask            :  1,   /* 0: enabled, 1: disabled */
+        __reserved_2    : 15;
+   
+    union {
+        struct { __u32
+            __reserved_1    : 24,
+            physical_dest   :  4,
+            __reserved_2    :  4;
+        } physical;
+
+        struct { __u32
+            __reserved_1    : 24,
+            logical_dest    :  8;
+        } logical;
+
+#ifdef __ia64__
+        struct { __u32
+            __reserved_1    : 16,
+            dest_id         : 16;
+        };
+#endif
+    } dest;
+
+} __attribute__ ((packed));
+
+struct IO_APIC_route_remap_entry {
+    union {
+        u64 val;
+        struct {
+            u64 vector:8,
+            delivery_mode:3,
+            index_15:1,
+            delivery_status:1,
+            polarity:1,
+            irr:1,
+            trigger:1,
+            mask:1,
+            reserved:31,
+            format:1,
+            index_0_14:15;
+        };
+    };
+};
+
+struct msi_msg_remap_entry {
+    union {
+        u32 val;
+        struct {
+            u32 dontcare:2,
+                index_15:1,
+                SHV:1,
+                format:1,
+                index_0_14:15,
+                addr_id_val:12; /* Interrupt address identifier value,
+                                   must be 0FEEh */
+        };
+    } address_lo;   /* low 32 bits of msi message address */
+
+    u32	address_hi;	/* high 32 bits of msi message address */
+    u32	data;		/* msi message data */
+};
+
+unsigned int get_cache_line_size(void);
+void cacheline_flush(char *);
+void flush_all_cache(void);
+void *map_to_nocache_virt(int nr_iommus, u64 maddr);
+u64 alloc_pgtable_maddr(struct domain *d);
+void free_pgtable_maddr(u64 maddr);
+void *map_vtd_domain_page(u64 maddr);
+void unmap_vtd_domain_page(void *va);
+
+void iommu_flush_cache_entry(void *addr);
+void iommu_flush_cache_page(void *addr);
+
+#endif // _VTD_H_
diff -Naurp xen/drivers/passthrough/vtd/x86/Makefile xen-redhat/drivers/passthrough/vtd/x86/Makefile
--- xen/drivers/passthrough/vtd/x86/Makefile
+++ xen-redhat/drivers/passthrough/vtd/x86/Makefile
@@ -0,0 +1 @@
+obj-y += vtd.o
diff -Naurp xen/drivers/passthrough/vtd/x86/vtd.c xen-redhat/drivers/passthrough/vtd/x86/vtd.c
--- xen/drivers/passthrough/vtd/x86/vtd.c
+++ xen-redhat/drivers/passthrough/vtd/x86/vtd.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2008, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Allen Kay <allen.m.kay@intel.com>
+ * Copyright (C) Weidong Han <weidong.han@intel.com>
+ */
+
+#include <xen/sched.h>
+#include <xen/domain_page.h>
+#include <asm/paging.h>
+#include <xen/iommu.h>
+#include <xen/numa.h>
+#include "../iommu.h"
+#include "../dmar.h"
+#include "../vtd.h"
+
+void *map_vtd_domain_page(u64 maddr)
+{
+    return map_domain_page(maddr >> PAGE_SHIFT_4K);
+}
+
+void unmap_vtd_domain_page(void *va)
+{
+    unmap_domain_page(va);
+}
+
+/* Allocate page table, return its machine address */
+u64 alloc_pgtable_maddr(struct domain *d)
+{
+    struct page_info *pg;
+    u64 *vaddr;
+    unsigned long mfn;
+
+    pg = alloc_domheap_page(NULL);
+    if ( !pg )
+        return 0;
+    mfn = page_to_mfn(pg);
+    vaddr = map_domain_page(mfn);
+    memset(vaddr, 0, PAGE_SIZE);
+
+    iommu_flush_cache_page(vaddr);
+    unmap_domain_page(vaddr);
+
+    return (u64)mfn << PAGE_SHIFT_4K;
+}
+
+void free_pgtable_maddr(u64 maddr)
+{
+    if ( maddr != 0 )
+        free_domheap_page(maddr_to_page(maddr));
+}
+
+unsigned int get_cache_line_size(void)
+{
+    return ((cpuid_ebx(1) >> 8) & 0xff) * 8;
+}
+
+void cacheline_flush(char * addr)
+{
+    clflush(addr);
+}
+
+void flush_all_cache(void)
+{
+    wbinvd();
+}
+
+void *map_to_nocache_virt(int nr_iommus, u64 maddr)
+{
+    set_fixmap_nocache(FIX_IOMMU_REGS_BASE_0 + nr_iommus, maddr);
+    return (void *)fix_to_virt(FIX_IOMMU_REGS_BASE_0 + nr_iommus);
+}
+
+struct hvm_irq_dpci *domain_get_irq_dpci(struct domain *domain)
+{
+    if ( !domain )
+        return NULL;
+
+    return domain->arch.hvm_domain.irq.dpci;
+}
+
+int domain_set_irq_dpci(struct domain *domain, struct hvm_irq_dpci *dpci)
+{
+    if ( !domain || !dpci )
+        return 0;
+
+    domain->arch.hvm_domain.irq.dpci = dpci;
+    return 1;
+}
+
+void hvm_dpci_isairq_eoi(struct domain *d, unsigned int isairq)
+{
+    struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
+    struct hvm_irq_dpci *dpci = NULL;
+    struct dev_intx_gsi_link *digl, *tmp;
+    int i;
+
+    ASSERT(isairq < NR_ISAIRQS);
+    if ( !vtd_enabled )
+        return;
+
+    spin_lock(&d->event_lock);
+
+    dpci = domain_get_irq_dpci(d);
+
+    if ( !dpci || !test_bit(isairq, dpci->isairq_map) )
+    {
+        spin_unlock(&d->event_lock);
+        return;
+    }
+    /* Multiple mirq may be mapped to one isa irq */
+    for ( i = find_first_bit(dpci->mapping, NR_IRQS);
+          i < NR_IRQS;
+          i = find_next_bit(dpci->mapping, NR_IRQS, i + 1) )
+    {
+        list_for_each_entry_safe ( digl, tmp,
+            &dpci->mirq[i].digl_list, list )
+        {
+            if ( hvm_irq->pci_link.route[digl->link] == isairq )
+            {
+                hvm_pci_intx_deassert(d, digl->device, digl->intx);
+                if ( --dpci->mirq[i].pending == 0 )
+                {
+                    stop_timer(&dpci->hvm_timer[domain_irq_to_vector(d, i)]);
+                    pirq_guest_eoi(d, i);
+                }
+            }
+        }
+    }
+    spin_unlock(&d->event_lock);
+}
diff -Naurp xen/drivers/pci/Makefile xen-redhat/drivers/pci/Makefile
--- xen/drivers/pci/Makefile
+++ xen-redhat/drivers/pci/Makefile
@@ -0,0 +1 @@
+obj-y += pci.o
diff -Naurp xen/drivers/pci/pci.c xen-redhat/drivers/pci/pci.c
--- xen/drivers/pci/pci.c
+++ xen-redhat/drivers/pci/pci.c
@@ -0,0 +1,64 @@
+/******************************************************************************
+ * pci.c
+ *
+ * Architecture-independent PCI access functions.
+ */
+
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
+
+int pci_find_cap_offset(u8 bus, u8 dev, u8 func, u8 cap)
+{
+    u8 id;
+    int max_cap = 48;
+    u8 pos = PCI_CAPABILITY_LIST;
+    u16 status;
+
+    status = pci_conf_read16(bus, dev, func, PCI_STATUS);
+    if ( (status & PCI_STATUS_CAP_LIST) == 0 )
+        return 0;
+
+    while ( max_cap-- )
+    {
+        pos = pci_conf_read8(bus, dev, func, pos);
+        if ( pos < 0x40 )
+            break;
+
+        pos &= ~3;
+        id = pci_conf_read8(bus, dev, func, pos + PCI_CAP_LIST_ID);
+
+        if ( id == 0xff )
+            break;
+        else if ( id == cap )
+            return pos;
+
+        pos += PCI_CAP_LIST_NEXT;
+    }
+
+    return 0;
+}
+
+int pci_find_next_cap(u8 bus, unsigned int devfn, u8 pos, int cap)
+{
+    u8 id;
+    int ttl = 48;
+
+    while ( ttl-- )
+    {
+        pos = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pos);
+        if ( pos < 0x40 )
+            break;
+
+        pos &= ~3;
+        id = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+                            pos + PCI_CAP_LIST_ID);
+
+        if ( id == 0xff )
+            break;
+        if ( id == cap )
+            return pos;
+
+        pos += PCI_CAP_LIST_NEXT;
+    }
+    return 0;
+}
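
Both helpers walk the standard capability list by hand, bounded to 48 hops so a malformed list cannot loop forever. A hedged usage sketch, assuming the standard PCI_CAP_ID_PM and PCI_PM_CTRL definitions from xen/pci_regs.h:

    /* Illustrative only: locate the power-management capability and read PMCSR. */
    static int example_read_pmcsr(u8 bus, u8 dev, u8 func, u16 *pmcsr)
    {
        int pos = pci_find_cap_offset(bus, dev, func, PCI_CAP_ID_PM);

        if ( pos == 0 )
            return 0;                               /* no PM capability */

        *pmcsr = pci_conf_read16(bus, dev, func, pos + PCI_PM_CTRL);
        return 1;
    }
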
diff -Naurp xen/drivers/video/vesa.c xen-redhat/drivers/video/vesa.c
--- xen/drivers/video/vesa.c
+++ xen-redhat/drivers/video/vesa.c
@@ -146,10 +146,20 @@ void __init vesa_init(void)
     xfree(text_buf);
 }
 
-void __init vesa_endboot(void)
+void __init vesa_endboot(bool_t keep)
 {
-    xpos = 0;
-    vga_puts = vesa_scroll_puts;
+    if ( keep )
+    {
+        xpos = 0;
+        vga_puts = vesa_scroll_puts;
+    }
+    else
+    {
+        unsigned int i, bpp = (vlfb_info.bits_per_pixel + 7) >> 3;
+        for ( i = 0; i < vlfb_info.height; i++ )
+            memset(lfb + i * vlfb_info.bytes_per_line, 0,
+                   vlfb_info.width * bpp);
+    }
 }
 
 #if defined(CONFIG_X86)
diff -Naurp xen/drivers/video/vga.c xen-redhat/drivers/video/vga.c
--- xen/drivers/video/vga.c
+++ xen-redhat/drivers/video/vga.c
@@ -57,10 +57,10 @@ static unsigned int columns, lines;
 
 #ifdef CONFIG_X86_64
 void vesa_early_init(void);
-void vesa_endboot(void);
+void vesa_endboot(bool_t keep);
 #else
 #define vesa_early_init() ((void)0)
-#define vesa_endboot()    ((void)0)
+#define vesa_endboot(x)   ((void)0)
 #endif
 
 void __init vga_init(void)
@@ -99,16 +99,27 @@ void __init vga_init(void)
 
 void __init vga_endboot(void)
 {
-    if ( vga_puts == vga_noop_puts )
+    if ( !vga_console_info.video_type )
         return;
 
     printk("Xen is %s VGA console.\n",
            vgacon_keep ? "keeping" : "relinquishing");
 
-    vesa_endboot();
-
     if ( !vgacon_keep )
         vga_puts = vga_noop_puts;
+
+    switch ( vga_console_info.video_type )
+    {
+    case XEN_VGATYPE_TEXT_MODE_3:
+        if ( !vgacon_keep )
+            memset(video, 0, columns * lines * 2);
+        break;
+    case XEN_VGATYPE_VESA_LFB:
+        vesa_endboot(vgacon_keep);
+        break;
+    default:
+        BUG();
+    }
 }
 
 static void vga_text_puts(const char *s)
diff -Naurp xen/include/asm-ia64/bundle.h xen-redhat/include/asm-ia64/bundle.h
--- xen/include/asm-ia64/bundle.h
+++ xen-redhat/include/asm-ia64/bundle.h
@@ -33,6 +33,11 @@ typedef union U_INST64_B9 {
     struct { unsigned long qp:6, imm20:20, :1, x6:6, :3, i:1, major:4; };
 } INST64_B9;
 
+typedef union U_INST64_I18 {
+    IA64_INST inst;
+    struct { unsigned long qp:6, imm20:20, y:1, x6:6, x3:3, i:1, major:4; };
+} INST64_I18;
+
 typedef union U_INST64_I19 {
     IA64_INST inst;
     struct { unsigned long qp:6, imm20:20, :1, x6:6, x3:3, i:1, major:4; };
@@ -191,6 +196,7 @@ typedef union U_INST64 {
     INST64_B4 B4;	// used in build_hypercall_bundle only
     INST64_B8 B8;	// rfi, bsw.[01]
     INST64_B9 B9;	// break.b
+    INST64_I18 I18;	// nop.i used in build_fpswa_hypercall_bundle only
     INST64_I19 I19;	// used in build_hypercall_bundle only
     INST64_I26 I26;	// mov register to ar (I unit)
     INST64_I27 I27;	// mov immediate to ar (I unit)
diff -Naurp xen/include/asm-ia64/config.h xen-redhat/include/asm-ia64/config.h
--- xen/include/asm-ia64/config.h
+++ xen-redhat/include/asm-ia64/config.h
@@ -282,4 +282,6 @@ struct screen_info { };
 /* Define CONFIG_PRIVIFY to support privified OS (deprecated).  */
 #undef CONFIG_PRIVIFY
 
+#define ARCH_CRASH_SAVE_VMCOREINFO
+
 #endif	/* _IA64_CONFIG_H_ */
diff -Naurp xen/include/asm-ia64/debugger.h xen-redhat/include/asm-ia64/debugger.h
--- xen/include/asm-ia64/debugger.h
+++ xen-redhat/include/asm-ia64/debugger.h
@@ -56,13 +56,6 @@ show_execution_state(struct cpu_user_reg
 #ifdef CRASH_DEBUG
 // crash_debug=y
 
-/* The main trap handlers use these helper macros which include early bail. */
-static inline int debugger_trap_entry(
-    unsigned int vector, struct cpu_user_regs *regs)
-{
-    return 0;
-}
-
 extern int __trap_to_cdb(struct cpu_user_regs *r);
 static inline int debugger_trap_fatal(
     unsigned int vector, struct cpu_user_regs *regs)
@@ -73,23 +66,18 @@ static inline int debugger_trap_fatal(
 
 #define ____debugger_trap_immediate(b) __asm__ __volatile__ ("break.m "#b"\n")
 #define __debugger_trap_immediate(b) ____debugger_trap_immediate(b)
-#define debugger_trap_immediate() __debugger_trap_immediate(CDB_BREAK_NUM)
+#define debugger_trap_immediate()					\
+do {									\
+    if ( gdb_ctx->serhnd >= 0 )						\
+        __debugger_trap_immediate(CDB_BREAK_NUM);			\
+} while (0)
 
 //XXX temporal work around
 #ifndef CONFIG_SMP
 #define smp_send_stop()	/* nothing */
 #endif
 
-#elif defined DOMU_DEBUG
-// domu_debug=y
-#warning "domu_debug is not implemented yet."
-/* The main trap handlers use these helper macros which include early bail. */
-static inline int debugger_trap_entry(
-    unsigned int vector, struct cpu_user_regs *regs)
-{
-    return 0;
-}
-
+#else
 static inline int debugger_trap_fatal(
     unsigned int vector, struct cpu_user_regs *regs)
 {
@@ -97,22 +85,21 @@ static inline int debugger_trap_fatal(
 }
 
 #define debugger_trap_immediate()		((void)0)
-#else
-/* The main trap handlers use these helper macros which include early bail. */
+#endif
+
 static inline int debugger_trap_entry(
     unsigned int vector, struct cpu_user_regs *regs)
 {
-    return 0;
-}
+    struct vcpu *v = current;
+
+    if (guest_kernel_mode(regs) && v->domain->debugger_attached) {
+        domain_pause_for_debugger();
+        return 1;
+    }
 
-static inline int debugger_trap_fatal(
-    unsigned int vector, struct cpu_user_regs *regs)
-{
     return 0;
 }
 
-#define debugger_trap_immediate()		((void)0)
-#endif
 #endif // __ASSEMBLLY__
 
 #endif /* __ASM_DEBUGGER_H__ */
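
With the #else/#endif reshuffle, debugger_trap_entry() is now shared by all configurations: a fault taken in guest kernel mode while a debugger is attached pauses the domain and reports the trap as already handled. The header's existing "early bail" comment implies callers shaped roughly like this (handler name is illustrative, not from the patch):

    static void example_fault_handler(unsigned int vector,
                                      struct cpu_user_regs *regs)
    {
        if ( debugger_trap_entry(vector, regs) )
            return;             /* domain paused for its debugger; bail early */

        /* ... normal fault handling continues here ... */
    }
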
diff -Naurp xen/include/asm-ia64/domain.h xen-redhat/include/asm-ia64/domain.h
--- xen/include/asm-ia64/domain.h
+++ xen-redhat/include/asm-ia64/domain.h
@@ -18,7 +18,6 @@ struct p2m_entry;
 struct tlb_track;
 #endif
 
-extern void domain_relinquish_resources(struct domain *);
 struct vcpu;
 extern void relinquish_vcpu_resources(struct vcpu *v);
 extern void vcpu_share_privregs_with_guest(struct vcpu *v);
@@ -132,6 +131,19 @@ struct arch_domain {
 #ifdef CONFIG_XEN_IA64_TLB_TRACK
     struct tlb_track*   tlb_track;
 #endif
+
+    /* for domctl_destroy_domain continuation */
+    enum {
+        RELRES_not_started,
+        RELRES_mm_teardown,
+        RELRES_xen,
+        RELRES_dom,
+        RELRES_done,
+    } relres;
+    /* Continuable mm_teardown() */
+    unsigned long mm_teardown_offset;
+    /* Continuable domain_relinquish_resources() */
+    struct list_head relmem_list;
 };
 #define INT_ENABLE_OFFSET(v) 		  \
     (sizeof(vcpu_info_t) * (v)->vcpu_id + \
@@ -180,6 +192,11 @@ struct arch_vcpu {
     int starting_rid;		/* first RID assigned to domain */
     int ending_rid;		/* one beyond highest RID assigned to domain */
 
+    /* Bitset for debug register use.  */
+    unsigned int dbg_used;
+    u64 dbr[IA64_NUM_DBG_REGS];
+    u64 ibr[IA64_NUM_DBG_REGS];
+
     struct thread_struct _thread;	// this must be last
 
     thash_cb_t vtlb;
@@ -188,9 +205,10 @@ struct arch_vcpu {
     char irq_new_condition;    // vpsr.i/vtpr change, check for pending VHPI
     char hypercall_continuation;
 
+    fpswa_ret_t fpswa_ret;	/* save return values of FPSWA emulation */
+
     //for phycial  emulation
     int mode_flags;
-    fpswa_ret_t fpswa_ret;	/* save return values of FPSWA emulation */
     struct timer hlt_timer;
     struct arch_vmx_struct arch_vmx; /* Virtual Machine Extensions */
 
@@ -216,6 +234,9 @@ int
 do_perfmon_op(unsigned long cmd,
               XEN_GUEST_HANDLE(void) arg1, unsigned long arg2);
 
+void
+ia64_lazy_load_fpu(struct vcpu *vcpu);
+
 #endif /* __ASM_DOMAIN_H__ */
 
 /*
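
The new relres enum, mm_teardown_offset and relmem_list allow domain destruction to be restarted as a hypercall continuation instead of holding the CPU for the whole teardown. A minimal sketch of the intended state machine, using placeholder helpers (example_mm_teardown() and the comments stand in for the real ia64 functions):

    static int example_relinquish_resources(struct domain *d)
    {
        switch ( d->arch.relres )
        {
        case RELRES_not_started:
            d->arch.relres = RELRES_mm_teardown;
            /* fall through */
        case RELRES_mm_teardown:
            if ( example_mm_teardown(d, &d->arch.mm_teardown_offset) )
                return -EAGAIN;         /* preempted: caller retries later */
            d->arch.relres = RELRES_xen;
            /* fall through */
        case RELRES_xen:
            /* release Xen heap pages held for the domain ... */
            d->arch.relres = RELRES_dom;
            /* fall through */
        case RELRES_dom:
            /* drain d->arch.relmem_list, possibly returning -EAGAIN again ... */
            d->arch.relres = RELRES_done;
            /* fall through */
        case RELRES_done:
            break;
        }
        return 0;
    }
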
diff -Naurp xen/include/asm-ia64/dom_fw.h xen-redhat/include/asm-ia64/dom_fw.h
--- xen/include/asm-ia64/dom_fw.h
+++ xen-redhat/include/asm-ia64/dom_fw.h
@@ -7,10 +7,13 @@
 
 #include <linux/efi.h>
 
+#define __IA64_XEN_HYPERCALL_DEFAULT            0x1000
+#define __IA64_XEN_HYPERCALL_DEFAULT_STR        "0x1000"
+
 /* Portion of guest physical memory space reserved for PAL/SAL/EFI/ACPI
    data and code.  */
 #define FW_BASE_PADDR		0x0000UL
-#define FW_END_PADDR		0x3000UL
+#define FW_END_PADDR		0x8000UL
 
 /* This is used to determined the portion of a domain's metaphysical memory
    space reserved for the hypercall patch table. */
@@ -30,8 +33,8 @@
 #define FW_ACPI_END_PADDR	0x2000UL
 
 /* Base and end guest physical address of EFI and SAL (non-ACPI) tables.  */
-#define FW_TABLES_BASE_PADDR	0x2000UL
-#define FW_TABLES_END_PADDR	0x3000UL
+#define FW_TABLES_BASE_PADDR	0x4000UL
+#define FW_TABLES_END_PADDR	0x8000UL
 
 
 /* Hypercalls number have a low part and a high part.
@@ -157,13 +160,21 @@
 
 /*
  * This is a hypercall number for FPSWA.
- * FPSWA hypercall uses 2 bundles for a pseudo-entry-point and a hypercall-patch.
+ * FPSWA hypercall uses one bundle for a pseudo-entry-point
+ * and 14 bundles for a hypercall-patch.
+ *
+ * 0x500 was used before, but that implementation was broken.
+ * To keep the hypercall ABI, 0x500 is obsoleted and 0x501 is
+ * allocated for the fpswa hypercall.
  */
 #define FW_HYPERCALL_FPSWA_ENTRY_INDEX			0x90UL
 #define FW_HYPERCALL_FPSWA_PATCH_INDEX			0x91UL
 #define FW_HYPERCALL_FPSWA_ENTRY_PADDR			FW_HYPERCALL_PADDR(FW_HYPERCALL_FPSWA_ENTRY_INDEX)
 #define FW_HYPERCALL_FPSWA_PATCH_PADDR			FW_HYPERCALL_PADDR(FW_HYPERCALL_FPSWA_PATCH_INDEX)
-#define FW_HYPERCALL_FPSWA				0x500UL
+#define FW_HYPERCALL_FPSWA_BASE				0x500UL
+#define FW_HYPERCALL_FPSWA_BROKEN			0x500UL
+#define FW_HYPERCALL_FPSWA				0x501UL
+#define FW_HYPERCALL_FPSWA_STR				"0x501"
 
 /* Set the shared_info base virtual address.  */
 #define FW_HYPERCALL_SET_SHARED_INFO_VA			0x600UL
diff -Naurp xen/include/asm-ia64/grant_table.h xen-redhat/include/asm-ia64/grant_table.h
--- xen/include/asm-ia64/grant_table.h
+++ xen-redhat/include/asm-ia64/grant_table.h
@@ -12,7 +12,7 @@ int create_grant_host_mapping(unsigned l
 int destroy_grant_host_mapping(unsigned long gpaddr, unsigned long mfn, unsigned int flags);
 
 // for grant transfer
-void guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn);
+int guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn, int order);
 
 /* XXX
  * somewhere appropriate
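
guest_physmap_add_page() now takes a page order and reports failure instead of returning void, so grant-transfer callers can map multi-page extents and back out on error. A hedged single-page example, assuming a zero return still means success:

    static int example_grant_transfer_map(struct domain *d, unsigned long gpfn,
                                          unsigned long mfn)
    {
        /* order 0 == a single page; a non-zero return is treated as failure */
        return guest_physmap_add_page(d, gpfn, mfn, 0);
    }
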
diff -Naurp xen/include/asm-ia64/hvm/vacpi.h xen-redhat/include/asm-ia64/hvm/vacpi.h
--- xen/include/asm-ia64/hvm/vacpi.h
+++ xen-redhat/include/asm-ia64/hvm/vacpi.h
@@ -0,0 +1,55 @@
+/*
+ * vacpi.h: Virtual ACPI definitions
+ *
+ * Copyright (c) 2007, FUJITSU LIMITED
+ *      Kouya Shimura <kouya at jp fujitsu com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#ifndef __ASM_IA64_HVM_VACPI_H__
+#define __ASM_IA64_HVM_VACPI_H__
+
+#include <public/hvm/ioreq.h>
+
+#define ACPI_PM1A_EVT_BLK_ADDRESS 0x0000000000001f40
+#define ACPI_PM1A_CNT_BLK_ADDRESS (ACPI_PM1A_EVT_BLK_ADDRESS + 0x04)
+#define ACPI_PM_TMR_BLK_ADDRESS   (ACPI_PM1A_EVT_BLK_ADDRESS + 0x08)
+
+#define IS_ACPI_ADDR(X)  ((unsigned long)((X)-ACPI_PM1A_EVT_BLK_ADDRESS)<12)
+
+#define FREQUENCE_PMTIMER  3579545UL	/* Timer should run at 3.579545 MHz */
+
+struct vacpi_regs {
+	union {
+		struct {
+			uint32_t pm1a_sts:16;
+			uint32_t pm1a_en:16;
+		};
+		uint32_t evt_blk;
+	};
+	uint32_t tmr_val;
+};
+
+struct vacpi {
+	struct vacpi_regs regs;
+	s_time_t last_gtime;
+	struct timer timer;
+};
+
+int vacpi_intercept(ioreq_t * p, u64 * val);
+void vacpi_init(struct domain *d);
+void vacpi_relinquish_resources(struct domain *d);
+
+#endif	/* __ASM_IA64_HVM_VACPI_H__ */
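
The virtual PM timer advances at FREQUENCE_PMTIMER (3.579545 MHz), which the emulator has to synthesize from Xen system time using last_gtime and tmr_val. An illustrative sketch of that scaling, assuming NOW()/SECONDS() from xen/time.h; this is not the patch's actual intercept code, and overflow for very long gaps is ignored here:

    static uint32_t example_pmt_read(struct vacpi *s)
    {
        s_time_t now = NOW();

        /* ticks elapsed = elapsed ns * 3.579545 MHz / ns per second */
        s->regs.tmr_val += ((now - s->last_gtime) * FREQUENCE_PMTIMER)
                           / SECONDS(1);
        s->last_gtime = now;

        return s->regs.tmr_val;     /* the uint32_t field simply wraps */
    }
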
diff -Naurp xen/include/asm-ia64/linux/asm/sn/pcidev.h xen-redhat/include/asm-ia64/linux/asm/sn/pcidev.h
--- xen/include/asm-ia64/linux/asm/sn/pcidev.h
+++ xen-redhat/include/asm-ia64/linux/asm/sn/pcidev.h
@@ -1,83 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 1992 - 1997, 2000-2005 Silicon Graphics, Inc. All rights reserved.
- */
-#ifndef _ASM_IA64_SN_PCI_PCIDEV_H
-#define _ASM_IA64_SN_PCI_PCIDEV_H
-
-#include <linux/pci.h>
-
-/*
- * In ia64, pci_dev->sysdata must be a *pci_controller. To provide access to
- * the pcidev_info structs for all devices under a controller, we extend the
- * definition of pci_controller, via sn_pci_controller, to include a list
- * of pcidev_info.
- */
-struct sn_pci_controller {
-	struct pci_controller pci_controller;
-	struct list_head pcidev_info;
-};
-
-#define SN_PCI_CONTROLLER(dev) ((struct sn_pci_controller *) dev->sysdata)
-
-#define SN_PCIDEV_INFO(dev)	sn_pcidev_info_get(dev)
-
-#define SN_PCIBUS_BUSSOFT_INFO(pci_bus) \
-	(struct pcibus_info *)((struct pcibus_bussoft *)(PCI_CONTROLLER((pci_bus))->platform_data))
-/*
- * Given a pci_bus, return the sn pcibus_bussoft struct.  Note that
- * this only works for root busses, not for busses represented by PPB's.
- */
-
-#define SN_PCIBUS_BUSSOFT(pci_bus) \
-        ((struct pcibus_bussoft *)(PCI_CONTROLLER((pci_bus))->platform_data))
-
-#define SN_PCIBUS_BUSSOFT_INFO(pci_bus) \
-	(struct pcibus_info *)((struct pcibus_bussoft *)(PCI_CONTROLLER((pci_bus))->platform_data))
-/*
- * Given a struct pci_dev, return the sn pcibus_bussoft struct.  Note
- * that this is not equivalent to SN_PCIBUS_BUSSOFT(pci_dev->bus) due
- * due to possible PPB's in the path.
- */
-
-#define SN_PCIDEV_BUSSOFT(pci_dev) \
-	(SN_PCIDEV_INFO(pci_dev)->pdi_host_pcidev_info->pdi_pcibus_info)
-
-#define SN_PCIDEV_BUSPROVIDER(pci_dev) \
-	(SN_PCIDEV_INFO(pci_dev)->pdi_provider)
-
-#define PCIIO_BUS_NONE	255      /* bus 255 reserved */
-#define PCIIO_SLOT_NONE 255
-#define PCIIO_FUNC_NONE 255
-#define PCIIO_VENDOR_ID_NONE	(-1)
-
-struct pcidev_info {
-	u64		pdi_pio_mapped_addr[7]; /* 6 BARs PLUS 1 ROM */
-	u64		pdi_slot_host_handle;	/* Bus and devfn Host pci_dev */
-
-	struct pcibus_bussoft	*pdi_pcibus_info;	/* Kernel common bus soft */
-	struct pcidev_info	*pdi_host_pcidev_info;	/* Kernel Host pci_dev */
-	struct pci_dev		*pdi_linux_pcidev;	/* Kernel pci_dev */
-
-	struct sn_irq_info	*pdi_sn_irq_info;
-	struct sn_pcibus_provider *pdi_provider;	/* sn pci ops */
-	struct pci_dev 		*host_pci_dev;		/* host bus link */
-	struct list_head	pdi_list;		/* List of pcidev_info */
-};
-
-extern void sn_irq_fixup(struct pci_dev *pci_dev,
-			 struct sn_irq_info *sn_irq_info);
-extern void sn_irq_unfixup(struct pci_dev *pci_dev);
-extern struct pcidev_info * sn_pcidev_info_get(struct pci_dev *);
-extern void sn_pci_controller_fixup(int segment, int busnum,
- 				    struct pci_bus *bus);
-extern void sn_bus_store_sysdata(struct pci_dev *dev);
-extern void sn_bus_free_sysdata(void);
-extern void sn_generate_path(struct pci_bus *pci_bus, char *address);
-extern void sn_pci_fixup_slot(struct pci_dev *dev);
-extern void sn_pci_unfixup_slot(struct pci_dev *dev);
-extern void sn_irq_lh_init(void);
-#endif				/* _ASM_IA64_SN_PCI_PCIDEV_H */
diff -Naurp xen/include/asm-ia64/linux/asm/sn/README.origin xen-redhat/include/asm-ia64/linux/asm/sn/README.origin
--- xen/include/asm-ia64/linux/asm/sn/README.origin
+++ xen-redhat/include/asm-ia64/linux/asm/sn/README.origin
@@ -10,7 +10,6 @@ l1.h			-> linux/include/asm-ia64/sn/l1.h
 leds.h			-> linux/include/asm-ia64/sn/leds.h
 module.h		-> linux/include/asm-ia64/sn/module.h
 pcibus_provider_defs.h	-> linux/include/asm-ia64/sn/pcibus_provider_defs.h
-pcidev.h		-> linux/include/asm-ia64/sn/pcidev.h
 pda.h			-> linux/include/asm-ia64/sn/pda.h
 pic.h			-> linux/include/asm-ia64/sn/pic.h
 shub_mmr.h		-> linux/include/asm-ia64/sn/shub_mmr.h
diff -Naurp xen/include/asm-ia64/linux/pci_regs.h xen-redhat/include/asm-ia64/linux/pci_regs.h
--- xen/include/asm-ia64/linux/pci_regs.h
+++ xen-redhat/include/asm-ia64/linux/pci_regs.h
@@ -229,7 +229,7 @@
 #define  PCI_PM_CAP_PME_D3cold	0x8000	/* PME# from D3 (cold) */
 #define PCI_PM_CTRL		4	/* PM control and status register */
 #define  PCI_PM_CTRL_STATE_MASK	0x0003	/* Current power state (D0 to D3) */
-#define  PCI_PM_CTRL_NO_SOFT_RESET	0x0004	/* No reset for D3hot->D0 */
+#define  PCI_PM_CTRL_NO_SOFT_RESET	0x0008	/* No reset for D3hot->D0 */
 #define  PCI_PM_CTRL_PME_ENABLE	0x0100	/* PME pin enable */
 #define  PCI_PM_CTRL_DATA_SEL_MASK	0x1e00	/* Data select (??) */
 #define  PCI_PM_CTRL_DATA_SCALE_MASK	0x6000	/* Data scale (??) */
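
The old 0x0004 value pointed at a reserved PMCSR bit; in the PCI power-management interface specification No_Soft_Reset is bit 3, hence 0x0008. A small hedged check built on the config accessors used elsewhere in this patch, with pos assumed to already point at the PM capability:

    /* Illustrative: does the device keep its config across D3hot -> D0? */
    static int example_no_soft_reset(u8 bus, u8 dev, u8 func, int pos)
    {
        u16 pmcsr = pci_conf_read16(bus, dev, func, pos + PCI_PM_CTRL);

        return (pmcsr & PCI_PM_CTRL_NO_SOFT_RESET) != 0;
    }
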
diff -Naurp xen/include/asm-ia64/linux-xen/asm/pal.h xen-redhat/include/asm-ia64/linux-xen/asm/pal.h
--- xen/include/asm-ia64/linux-xen/asm/pal.h
+++ xen-redhat/include/asm-ia64/linux-xen/asm/pal.h
@@ -20,6 +20,8 @@
  * 00/05/24     eranian Updated to latest PAL spec, fix structures bugs, added
  * 00/05/25	eranian Support for stack calls, and static physical calls
  * 00/06/18	eranian Support for stacked physical calls
+ * 06/10/26	rja	Support for Intel Itanium Architecture Software Developer's
+ *			Manual Rev 2.2 (Jan 2006)
  */
 
 /*
@@ -30,7 +32,7 @@
 #define PAL_CACHE_FLUSH		1	/* flush i/d cache */
 #define PAL_CACHE_INFO		2	/* get detailed i/d cache info */
 #define PAL_CACHE_INIT		3	/* initialize i/d cache */
-#define PAL_CACHE_SUMMARY	4	/* get summary of cache heirarchy */
+#define PAL_CACHE_SUMMARY	4	/* get summary of cache hierarchy */
 #define PAL_MEM_ATTRIB		5	/* list supported memory attributes */
 #define PAL_PTCE_INFO		6	/* purge TLB info */
 #define PAL_VM_INFO		7	/* return supported virtual memory features */
@@ -68,6 +70,9 @@
 #define PAL_SHUTDOWN		40	/* enter processor shutdown state */
 #define PAL_PREFETCH_VISIBILITY	41	/* Make Processor Prefetches Visible */
 #define PAL_LOGICAL_TO_PHYSICAL 42	/* returns information on logical to physical processor mapping */
+#define PAL_CACHE_SHARED_INFO	43	/* returns information on caches shared by logical processor */
+#define PAL_GET_HW_POLICY	48	/* Get current hardware resource sharing policy */
+#define PAL_SET_HW_POLICY	49	/* Set current hardware resource sharing policy */
 
 #define PAL_COPY_PAL		256	/* relocate PAL procedures and PAL PMI */
 #define PAL_HALT_INFO		257	/* return the low power capabilities of processor */
@@ -75,6 +80,14 @@
 #define PAL_CACHE_READ		259	/* read tag & data of cacheline for diagnostic testing */
 #define PAL_CACHE_WRITE		260	/* write tag & data of cacheline for diagnostic testing */
 #define PAL_VM_TR_READ		261	/* read contents of translation register */
+#define PAL_GET_PSTATE		262	/* get the current P-state */
+#define PAL_SET_PSTATE		263	/* set the P-state */
+#define PAL_BRAND_INFO		274	/* Processor branding information */
+
+#define PAL_GET_PSTATE_TYPE_LASTSET	0
+#define PAL_GET_PSTATE_TYPE_AVGANDRESET	1
+#define PAL_GET_PSTATE_TYPE_AVGNORESET	2
+#define PAL_GET_PSTATE_TYPE_INSTANT	3
 
 #ifndef __ASSEMBLY__
 
@@ -98,15 +111,16 @@ typedef s64				pal_status_t;
 						 * cache without sideeffects
 						 * and "restrict" was 1
 						 */
+#define PAL_STATUS_REQUIRES_MEMORY	(-9)	/* Call requires PAL memory buffer */
 
-/* Processor cache level in the heirarchy */
+/* Processor cache level in the hierarchy */
 typedef u64				pal_cache_level_t;
 #define PAL_CACHE_LEVEL_L0		0	/* L0 */
 #define PAL_CACHE_LEVEL_L1		1	/* L1 */
 #define PAL_CACHE_LEVEL_L2		2	/* L2 */
 
 
-/* Processor cache type at a particular level in the heirarchy */
+/* Processor cache type at a particular level in the hierarchy */
 
 typedef u64				pal_cache_type_t;
 #define PAL_CACHE_TYPE_INSTRUCTION	1	/* Instruction cache */
@@ -131,7 +145,7 @@ typedef u64				pal_cache_line_state_t;
 #define PAL_CACHE_LINE_STATE_MODIFIED	3	/* Modified */
 
 typedef struct pal_freq_ratio {
-	u64 den : 32, num : 32;	/* numerator & denominator */
+	u32 den, num;		/* numerator & denominator */
 } itc_ratio, proc_ratio;
 
 typedef	union  pal_cache_config_info_1_s {
@@ -152,10 +166,10 @@ typedef	union  pal_cache_config_info_1_s
 
 typedef	union  pal_cache_config_info_2_s {
 	struct {
-		u64		cache_size	: 32,	/*cache size in bytes*/
+		u32		cache_size;		/*cache size in bytes*/
 
 
-				alias_boundary	: 8,	/* 39-32 aliased addr
+		u32		alias_boundary	: 8,	/* 39-32 aliased addr
 							 * separation for max
 							 * performance.
 							 */
@@ -261,14 +275,14 @@ typedef struct pal_cache_protection_info
 #define PAL_CACHE_PROT_METHOD_ECC		3	/* ECC protection */
 
 
-/* Processor cache line identification in the heirarchy */
+/* Processor cache line identification in the hierarchy */
 typedef union pal_cache_line_id_u {
 	u64			pclid_data;
 	struct {
 		u64		cache_type	: 8,	/* 7-0 cache type */
 				level		: 8,	/* 15-8 level of the
 							 * cache in the
-							 * heirarchy.
+							 * hierarchy.
 							 */
 				way		: 8,	/* 23-16 way in the set
 							 */
@@ -281,7 +295,7 @@ typedef union pal_cache_line_id_u {
 		u64		cache_type	: 8,	/* 7-0 cache type */
 				level		: 8,	/* 15-8 level of the
 							 * cache in the
-							 * heirarchy.
+							 * hierarchy.
 							 */
 				way		: 8,	/* 23-16 way in the set
 							 */
@@ -360,6 +374,7 @@ typedef u64					pal_mc_info_index_t;
 							 * dependent
 							 */
 
+#define PAL_TLB_CHECK_OP_PURGE			8
 
 typedef struct pal_process_state_info_s {
 	u64		reserved1	: 2,
@@ -455,7 +470,9 @@ typedef struct pal_process_state_info_s 
 						 * by the processor
 						 */
 
-			reserved2	: 11,
+			se		: 1,	/* Shared error.  MCA in a
+						   shared structure */
+			reserved2	: 10,
 			cc		: 1,	/* Cache check */
 			tc		: 1,	/* TLB check */
 			bc		: 1,	/* Bus check */
@@ -486,10 +503,12 @@ typedef struct pal_cache_check_info_s {
 						 * error occurred
 						 */
 			wiv		: 1,	/* Way field valid */
-			reserved2	: 10,
+			reserved2	: 1,
+			dp		: 1,	/* Data poisoned on MBE */
+			reserved3	: 8,
 
 			index		: 20,	/* Cache line index */
-			reserved3	: 2,
+			reserved4	: 2,
 
 			is		: 1,	/* instruction set (1 == ia32) */
 			iv		: 1,	/* instruction set field valid */
@@ -556,7 +575,7 @@ typedef struct pal_bus_check_info_s {
 			type		: 8,	/* Bus xaction type*/
 			sev		: 5,	/* Bus error severity*/
 			hier		: 2,	/* Bus hierarchy level */
-			reserved1	: 1,
+			dp		: 1,	/* Data poisoned on MBE */
 			bsi		: 8,	/* Bus error status
 						 * info
 						 */
@@ -763,7 +782,7 @@ struct ia64_pal_retval {
  * (generally 0) MUST be passed.  Reserved parameters are not optional
  * parameters.
  */
-extern struct ia64_pal_retval ia64_pal_call_static (u64, u64, u64, u64, u64);
+extern struct ia64_pal_retval ia64_pal_call_static (u64, u64, u64, u64);
 extern struct ia64_pal_retval ia64_pal_call_stacked (u64, u64, u64, u64);
 extern struct ia64_pal_retval ia64_pal_call_phys_static (u64, u64, u64, u64);
 extern struct ia64_pal_retval ia64_pal_call_phys_stacked (u64, u64, u64, u64);
@@ -773,14 +792,7 @@ extern void ia64_load_scratch_fpregs (st
 #define PAL_CALL(iprv,a0,a1,a2,a3) do {			\
 	struct ia64_fpreg fr[6];			\
 	ia64_save_scratch_fpregs(fr);			\
-	iprv = ia64_pal_call_static(a0, a1, a2, a3, 0);	\
-	ia64_load_scratch_fpregs(fr);			\
-} while (0)
-
-#define PAL_CALL_IC_OFF(iprv,a0,a1,a2,a3) do {		\
-	struct ia64_fpreg fr[6];			\
-	ia64_save_scratch_fpregs(fr);			\
-	iprv = ia64_pal_call_static(a0, a1, a2, a3, 1);	\
+	iprv = ia64_pal_call_static(a0, a1, a2, a3);	\
 	ia64_load_scratch_fpregs(fr);			\
 } while (0)
 
@@ -840,7 +852,9 @@ typedef union pal_bus_features_u {
 		u64	pbf_req_bus_parking			:	1;
 		u64	pbf_bus_lock_mask			:	1;
 		u64	pbf_enable_half_xfer_rate		:	1;
-		u64	pbf_reserved2				:	22;
+		u64	pbf_reserved2				:	20;
+		u64	pbf_enable_shared_line_replace		:	1;
+		u64	pbf_enable_exclusive_line_replace	:	1;
 		u64	pbf_disable_xaction_queueing		:	1;
 		u64	pbf_disable_resp_err_check		:	1;
 		u64	pbf_disable_berr_check			:	1;
@@ -928,11 +942,7 @@ static inline s64
 ia64_pal_cache_flush (u64 cache_type, u64 invalidate, u64 *progress, u64 *vector)
 {
 	struct ia64_pal_retval iprv;
-#ifdef XEN	/* fix a bug in Linux... PAL has changed */
 	PAL_CALL(iprv, PAL_CACHE_FLUSH, cache_type, invalidate, *progress);
-#else
-	PAL_CALL_IC_OFF(iprv, PAL_CACHE_FLUSH, cache_type, invalidate, *progress);
-#endif
 	if (vector)
 		*vector = iprv.v0;
 	*progress = iprv.v1;
@@ -967,11 +977,12 @@ static inline s64
 ia64_pal_cache_read (pal_cache_line_id_u_t line_id, u64 physical_addr)
 {
 	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_CACHE_READ, line_id.pclid_data, physical_addr, 0);
+	PAL_CALL_PHYS_STK(iprv, PAL_CACHE_READ, line_id.pclid_data,
+				physical_addr, 0);
 	return iprv.status;
 }
 
-/* Return summary information about the heirarchy of caches controlled by the processor */
+/* Return summary information about the hierarchy of caches controlled by the processor */
 static inline s64
 ia64_pal_cache_summary (u64 *cache_levels, u64 *unique_caches)
 {
@@ -989,7 +1000,8 @@ static inline s64
 ia64_pal_cache_write (pal_cache_line_id_u_t line_id, u64 physical_addr, u64 data)
 {
 	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_CACHE_WRITE, line_id.pclid_data, physical_addr, data);
+	PAL_CALL_PHYS_STK(iprv, PAL_CACHE_WRITE, line_id.pclid_data,
+				physical_addr, data);
 	return iprv.status;
 }
 
@@ -1085,6 +1097,24 @@ ia64_pal_freq_ratios (struct pal_freq_ra
 	return iprv.status;
 }
 
+/*
+ * Get the current hardware resource sharing policy of the processor
+ */
+static inline s64
+ia64_pal_get_hw_policy (u64 proc_num, u64 *cur_policy, u64 *num_impacted,
+			u64 *la)
+{
+	struct ia64_pal_retval iprv;
+	PAL_CALL(iprv, PAL_GET_HW_POLICY, proc_num, 0, 0);
+	if (cur_policy)
+		*cur_policy = iprv.v0;
+	if (num_impacted)
+		*num_impacted = iprv.v1;
+	if (la)
+		*la = iprv.v2;
+	return iprv.status;
+}
+
 /* Make the processor enter HALT or one of the implementation dependent low
  * power states where prefetching and execution are suspended and cache and
  * TLB coherency is not maintained.
@@ -1118,6 +1148,34 @@ ia64_pal_halt_info (pal_power_mgmt_info_
 	return iprv.status;
 }
 
+/* Get the current P-state information */
+static inline s64
+ia64_pal_get_pstate (u64 *pstate_index, unsigned long type)
+{
+	struct ia64_pal_retval iprv;
+	PAL_CALL_STK(iprv, PAL_GET_PSTATE, type, 0, 0);
+	*pstate_index = iprv.v0;
+	return iprv.status;
+}
+
+/* Set the P-state */
+static inline s64
+ia64_pal_set_pstate (u64 pstate_index)
+{
+	struct ia64_pal_retval iprv;
+	PAL_CALL_STK(iprv, PAL_SET_PSTATE, pstate_index, 0, 0);
+	return iprv.status;
+}
+
+/* Processor branding information*/
+static inline s64
+ia64_pal_get_brand_info (char *brand_info)
+{
+	struct ia64_pal_retval iprv;
+	PAL_CALL_STK(iprv, PAL_BRAND_INFO, 0, (u64)brand_info, 0);
+	return iprv.status;
+}
+
 /* Cause the processor to enter LIGHT HALT state, where prefetching and execution are
  * suspended, but cache and TLB coherency is maintained.
  */
@@ -1381,6 +1439,17 @@ ia64_pal_rse_info (u64 *num_phys_stacked
 	return iprv.status;
 }
 
+/*
+ * Set the current hardware resource sharing policy of the processor
+ */
+static inline s64
+ia64_pal_set_hw_policy (u64 policy)
+{
+	struct ia64_pal_retval iprv;
+	PAL_CALL(iprv, PAL_SET_HW_POLICY, policy, 0, 0);
+	return iprv.status;
+}
+
 /* Cause the processor to enter	SHUTDOWN state, where prefetching and execution are
  * suspended, but cause cache and TLB coherency to be maintained.
  * This is usually called in IA-32 mode.
@@ -1418,7 +1487,12 @@ typedef union  pal_version_u {
 } pal_version_u_t;
 
 
-/* Return PAL version information */
+/*
+ * Return PAL version information.  While the documentation states that
+ * PAL_VERSION can be called in either physical or virtual mode, some
+ * implementations only allow physical calls.  We don't call it very often,
+ * so the overhead isn't worth eliminating.
+ */
 static inline s64
 ia64_pal_version (pal_version_u_t *pal_min_version, pal_version_u_t *pal_cur_version)
 {
@@ -1499,12 +1573,15 @@ typedef union pal_vm_info_1_u {
 	} pal_vm_info_1_s;
 } pal_vm_info_1_u_t;
 
+#define PAL_MAX_PURGES		0xFFFF		/* all ones means unlimited */
+
 typedef union pal_vm_info_2_u {
 	u64			pvi2_val;
 	struct {
 		u64		impl_va_msb	: 8,
 				rid_size	: 8,
-				reserved	: 48;
+				max_purges	: 16,
+				reserved	: 32;
 	} pal_vm_info_2_s;
 } pal_vm_info_2_u_t;
 
@@ -1626,14 +1703,40 @@ ia64_pal_logical_to_phys(u64 proc_number
 
 	if (iprv.status == PAL_STATUS_SUCCESS)
 	{
-		if (proc_number == 0)
-			mapping->overview.overview_data = iprv.v0;
+		mapping->overview.overview_data = iprv.v0;
 		mapping->ppli1.ppli1_data = iprv.v1;
 		mapping->ppli2.ppli2_data = iprv.v2;
 	}
 
 	return iprv.status;
 }
+
+typedef struct pal_cache_shared_info_s
+{
+	u64 num_shared;
+	pal_proc_n_log_info1_t ppli1;
+	pal_proc_n_log_info2_t ppli2;
+} pal_cache_shared_info_t;
+
+/* Get information on caches shared by the specified logical processor. */
+static inline s64
+ia64_pal_cache_shared_info(u64 level,
+		u64 type,
+		u64 proc_number,
+		pal_cache_shared_info_t *info)
+{
+	struct ia64_pal_retval iprv;
+
+	PAL_CALL(iprv, PAL_CACHE_SHARED_INFO, level, type, proc_number);
+
+	if (iprv.status == PAL_STATUS_SUCCESS) {
+		info->num_shared = iprv.v0;
+		info->ppli1.ppli1_data = iprv.v1;
+		info->ppli2.ppli2_data = iprv.v2;
+	}
+
+	return iprv.status;
+}
 #ifdef XEN
 #include <asm/vmx_pal.h>
 #endif
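
Among the additions are wrappers for the new P-state and branding calls, which use the stacked calling convention. A hedged usage sketch (not from the patch) that tolerates processors whose PAL does not implement PAL_GET_PSTATE:

    static void example_report_pstate(void)
    {
        u64 pstate_index;
        s64 status = ia64_pal_get_pstate(&pstate_index,
                                         PAL_GET_PSTATE_TYPE_INSTANT);

        if ( status == PAL_STATUS_SUCCESS )
            printk("current P-state index: %lu\n", pstate_index);
        else if ( status == PAL_STATUS_UNIMPLEMENTED )
            printk("PAL_GET_PSTATE not implemented by this PAL\n");
    }
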
diff -Naurp xen/include/asm-ia64/linux-xen/asm/processor.h xen-redhat/include/asm-ia64/linux-xen/asm/processor.h
--- xen/include/asm-ia64/linux-xen/asm/processor.h
+++ xen-redhat/include/asm-ia64/linux-xen/asm/processor.h
@@ -292,11 +292,14 @@ struct thread_struct {
 #else
 # define INIT_THREAD_PM
 #endif
+#ifndef XEN
 	__u64 dbr[IA64_NUM_DBG_REGS];
 	__u64 ibr[IA64_NUM_DBG_REGS];
+#endif
 	struct ia64_fpreg fph[96];	/* saved/loaded on demand */
 };
 
+#ifndef XEN
 #define INIT_THREAD {						\
 	.flags =	0,					\
 	.on_ustack =	0,					\
@@ -333,6 +336,7 @@ struct thread_struct {
 		regs->r1 = 0; regs->r9  = 0; regs->r11 = 0; regs->r13 = 0; regs->r15 = 0;	\
 	}											\
 } while (0)
+#endif
 
 /* Forward declarations, a strange C thing... */
 struct mm_struct;
diff -Naurp xen/include/asm-ia64/linux-xen/asm/ptrace.h xen-redhat/include/asm-ia64/linux-xen/asm/ptrace.h
--- xen/include/asm-ia64/linux-xen/asm/ptrace.h
+++ xen-redhat/include/asm-ia64/linux-xen/asm/ptrace.h
@@ -265,6 +265,10 @@ struct switch_stack {
   /* given a pointer to a task_struct, return the user's pt_regs */
 # define ia64_task_regs(t)		(((struct pt_regs *) ((char *) (t) + IA64_STK_OFFSET)) - 1)
 # define ia64_psr(regs)			((struct ia64_psr *) &(regs)->cr_ipsr)
+#ifdef XEN
+# define guest_kernel_mode(regs)	(ia64_psr(regs)->cpl == 2)
+# define vmx_guest_kernel_mode(regs)	(ia64_psr(regs)->cpl == 0)
+#endif
 # define user_mode(regs)		(((struct ia64_psr *) &(regs)->cr_ipsr)->cpl != 0)
 # define user_stack(task,regs)	((long) regs - (long) task == IA64_STK_OFFSET - sizeof(*regs))
 # define fsys_mode(task,regs)					\
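
guest_kernel_mode() encodes the ring compression Xen/ia64 applies: a paravirtualized guest kernel runs at privilege level 2, while a fully virtualized (VMX) guest keeps level 0, hence the separate vmx_ variant. This is the check the reworked debugger_trap_entry() above relies on; an isolated illustration of the same test:

    /* Illustrative only: the early-bail condition used by debugger_trap_entry(). */
    static int example_should_hand_to_debugger(struct vcpu *v,
                                               struct cpu_user_regs *regs)
    {
        return guest_kernel_mode(regs) && v->domain->debugger_attached;
    }
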
diff -Naurp xen/include/asm-ia64/linux-xen/asm/README.origin xen-redhat/include/asm-ia64/linux-xen/asm/README.origin
--- xen/include/asm-ia64/linux-xen/asm/README.origin
+++ xen-redhat/include/asm-ia64/linux-xen/asm/README.origin
@@ -17,7 +17,6 @@ mca_asm.h		-> linux/include/asm-ia64/mca
 meminit.h		-> linux/include/asm-ia64/meminit.h
 numa.h			-> linux/include/asm-ia64/numa.h
 page.h			-> linux/include/asm-ia64/page.h
-pal.h			-> linux/include/asm-ia64/pal.h
 percpu.h		-> linux/include/asm-ia64/percpu.h
 pgalloc.h		-> linux/include/asm-ia64/pgalloc.h
 pgtable.h		-> linux/include/asm-ia64/pgtable.h
@@ -42,3 +41,6 @@ machvec_dig.h		-> linux/include/asm-ia64
 machvec_sn2.h		-> linux/include/asm-ia64/machvec_sn2.h
 machvec_hpzx1.h		-> linux/include/asm-ia64/machvec_hpzx1.h
 machvec_pci.h		-> linux/include/asm-ia64/pci.h
+
+# The files below are from Linux-2.6.21
+pal.h			-> linux/include/asm-ia64/pal.h
diff -Naurp xen/include/asm-ia64/linux-xen/asm/sn/pcibr_provider.h xen-redhat/include/asm-ia64/linux-xen/asm/sn/pcibr_provider.h
--- xen/include/asm-ia64/linux-xen/asm/sn/pcibr_provider.h
+++ xen-redhat/include/asm-ia64/linux-xen/asm/sn/pcibr_provider.h
@@ -10,7 +10,7 @@
 
 #ifdef XEN
 #include <linux/spinlock.h>
-#include <linux/pci.h>
+#include <linux/linux-pci.h>
 #endif
 #include <asm/sn/intr.h>
 #include <asm/sn/pcibus_provider_defs.h>
diff -Naurp xen/include/asm-ia64/linux-xen/asm/sn/pcidev.h xen-redhat/include/asm-ia64/linux-xen/asm/sn/pcidev.h
--- xen/include/asm-ia64/linux-xen/asm/sn/pcidev.h
+++ xen-redhat/include/asm-ia64/linux-xen/asm/sn/pcidev.h
@@ -0,0 +1,87 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1992 - 1997, 2000-2005 Silicon Graphics, Inc. All rights reserved.
+ */
+#ifndef _ASM_IA64_SN_PCI_PCIDEV_H
+#define _ASM_IA64_SN_PCI_PCIDEV_H
+
+#ifdef XEN
+#include <linux/linux-pci.h>
+#else
+#include <linux/pci.h>
+#endif
+
+/*
+ * In ia64, pci_dev->sysdata must be a *pci_controller. To provide access to
+ * the pcidev_info structs for all devices under a controller, we extend the
+ * definition of pci_controller, via sn_pci_controller, to include a list
+ * of pcidev_info.
+ */
+struct sn_pci_controller {
+	struct pci_controller pci_controller;
+	struct list_head pcidev_info;
+};
+
+#define SN_PCI_CONTROLLER(dev) ((struct sn_pci_controller *) dev->sysdata)
+
+#define SN_PCIDEV_INFO(dev)	sn_pcidev_info_get(dev)
+
+#define SN_PCIBUS_BUSSOFT_INFO(pci_bus) \
+	(struct pcibus_info *)((struct pcibus_bussoft *)(PCI_CONTROLLER((pci_bus))->platform_data))
+/*
+ * Given a pci_bus, return the sn pcibus_bussoft struct.  Note that
+ * this only works for root busses, not for busses represented by PPB's.
+ */
+
+#define SN_PCIBUS_BUSSOFT(pci_bus) \
+        ((struct pcibus_bussoft *)(PCI_CONTROLLER((pci_bus))->platform_data))
+
+#define SN_PCIBUS_BUSSOFT_INFO(pci_bus) \
+	(struct pcibus_info *)((struct pcibus_bussoft *)(PCI_CONTROLLER((pci_bus))->platform_data))
+/*
+ * Given a struct pci_dev, return the sn pcibus_bussoft struct.  Note
+ * that this is not equivalent to SN_PCIBUS_BUSSOFT(pci_dev->bus) due
+ * to possible PPB's in the path.
+ */
+
+#define SN_PCIDEV_BUSSOFT(pci_dev) \
+	(SN_PCIDEV_INFO(pci_dev)->pdi_host_pcidev_info->pdi_pcibus_info)
+
+#define SN_PCIDEV_BUSPROVIDER(pci_dev) \
+	(SN_PCIDEV_INFO(pci_dev)->pdi_provider)
+
+#define PCIIO_BUS_NONE	255      /* bus 255 reserved */
+#define PCIIO_SLOT_NONE 255
+#define PCIIO_FUNC_NONE 255
+#define PCIIO_VENDOR_ID_NONE	(-1)
+
+struct pcidev_info {
+	u64		pdi_pio_mapped_addr[7]; /* 6 BARs PLUS 1 ROM */
+	u64		pdi_slot_host_handle;	/* Bus and devfn Host pci_dev */
+
+	struct pcibus_bussoft	*pdi_pcibus_info;	/* Kernel common bus soft */
+	struct pcidev_info	*pdi_host_pcidev_info;	/* Kernel Host pci_dev */
+	struct pci_dev		*pdi_linux_pcidev;	/* Kernel pci_dev */
+
+	struct sn_irq_info	*pdi_sn_irq_info;
+	struct sn_pcibus_provider *pdi_provider;	/* sn pci ops */
+	struct pci_dev 		*host_pci_dev;		/* host bus link */
+	struct list_head	pdi_list;		/* List of pcidev_info */
+};
+
+extern void sn_irq_fixup(struct pci_dev *pci_dev,
+			 struct sn_irq_info *sn_irq_info);
+extern void sn_irq_unfixup(struct pci_dev *pci_dev);
+extern struct pcidev_info * sn_pcidev_info_get(struct pci_dev *);
+extern void sn_pci_controller_fixup(int segment, int busnum,
+ 				    struct pci_bus *bus);
+extern void sn_bus_store_sysdata(struct pci_dev *dev);
+extern void sn_bus_free_sysdata(void);
+extern void sn_generate_path(struct pci_bus *pci_bus, char *address);
+extern void sn_pci_fixup_slot(struct pci_dev *dev);
+extern void sn_pci_unfixup_slot(struct pci_dev *dev);
+extern void sn_irq_lh_init(void);
+#endif				/* _ASM_IA64_SN_PCI_PCIDEV_H */
diff -Naurp xen/include/asm-ia64/linux-xen/asm/sn/README.origin xen-redhat/include/asm-ia64/linux-xen/asm/sn/README.origin
--- xen/include/asm-ia64/linux-xen/asm/sn/README.origin
+++ xen-redhat/include/asm-ia64/linux-xen/asm/sn/README.origin
@@ -12,5 +12,6 @@ intr.h			-> linux/include/asm-ia64/sn/in
 io.h			-> linux/include/asm-ia64/sn/io.h
 nodepda.h		-> linux/include/asm-ia64/sn/nodepda.h
 pcibr_provider.h	-> linux/include/asm-ia64/sn/pcibr_provider.h
+pcidev.h		-> linux/include/asm-ia64/sn/pcidev.h
 rw_mmr.h		-> linux/include/asm-ia64/sn/rw_mmr.h
 types.h			-> linux/include/asm-ia64/sn/types.h
diff -Naurp xen/include/asm-ia64/linux-xen/linux/linux-pci.h xen-redhat/include/asm-ia64/linux-xen/linux/linux-pci.h
--- xen/include/asm-ia64/linux-xen/linux/linux-pci.h
+++ xen-redhat/include/asm-ia64/linux-xen/linux/linux-pci.h
@@ -0,0 +1,820 @@
+/*
+ *	pci.h
+ *
+ *	PCI defines and function prototypes
+ *	Copyright 1994, Drew Eckhardt
+ *	Copyright 1997--1999 Martin Mares <mj@ucw.cz>
+ *
+ *	For more information, please consult the following manuals (look at
+ *	http://www.pcisig.com/ for how to get them):
+ *
+ *	PCI BIOS Specification
+ *	PCI Local Bus Specification
+ *	PCI to PCI Bridge Specification
+ *	PCI System Design Guide
+ */
+
+#ifndef LINUX_PCI_H
+#define LINUX_PCI_H
+
+/* Include the pci register defines */
+#include <linux/pci_regs.h>
+
+/* Include the ID list */
+#include <linux/pci_ids.h>
+#ifdef XEN
+#include <asm/processor.h>
+#endif
+
+/*
+ * The PCI interface treats multi-function devices as independent
+ * devices.  The slot/function address of each device is encoded
+ * in a single byte as follows:
+ *
+ *	7:3 = slot
+ *	2:0 = function
+ */
+#define PCI_DEVFN(slot,func)	((((slot) & 0x1f) << 3) | ((func) & 0x07))
+#define PCI_SLOT(devfn)		(((devfn) >> 3) & 0x1f)
+#define PCI_FUNC(devfn)		((devfn) & 0x07)
+
+/* Ioctls for /proc/bus/pci/X/Y nodes. */
+#define PCIIOC_BASE		('P' << 24 | 'C' << 16 | 'I' << 8)
+#define PCIIOC_CONTROLLER	(PCIIOC_BASE | 0x00)	/* Get controller for PCI device. */
+#define PCIIOC_MMAP_IS_IO	(PCIIOC_BASE | 0x01)	/* Set mmap state to I/O space. */
+#define PCIIOC_MMAP_IS_MEM	(PCIIOC_BASE | 0x02)	/* Set mmap state to MEM space. */
+#define PCIIOC_WRITE_COMBINE	(PCIIOC_BASE | 0x03)	/* Enable/disable write-combining. */
+
+#ifdef __KERNEL__
+
+#include <linux/mod_devicetable.h>
+
+#include <linux/types.h>
+#include <linux/ioport.h>
+#include <linux/list.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/device.h>
+
+/* File state for mmap()s on /proc/bus/pci/X/Y */
+enum pci_mmap_state {
+	pci_mmap_io,
+	pci_mmap_mem
+};
+
+/* This defines the direction arg to the DMA mapping routines. */
+#define PCI_DMA_BIDIRECTIONAL	0
+#define PCI_DMA_TODEVICE	1
+#define PCI_DMA_FROMDEVICE	2
+#define PCI_DMA_NONE		3
+
+#define DEVICE_COUNT_COMPATIBLE	4
+#define DEVICE_COUNT_RESOURCE	12
+
+typedef int __bitwise pci_power_t;
+
+#define PCI_D0		((pci_power_t __force) 0)
+#define PCI_D1		((pci_power_t __force) 1)
+#define PCI_D2		((pci_power_t __force) 2)
+#define PCI_D3hot	((pci_power_t __force) 3)
+#define PCI_D3cold	((pci_power_t __force) 4)
+#define PCI_UNKNOWN	((pci_power_t __force) 5)
+#define PCI_POWER_ERROR	((pci_power_t __force) -1)
+
+/** The pci_channel state describes connectivity between the CPU and
+ *  the pci device.  If some PCI bus between here and the pci device
+ *  has crashed or locked up, this info is reflected here.
+ */
+typedef unsigned int __bitwise pci_channel_state_t;
+
+enum pci_channel_state {
+	/* I/O channel is in normal state */
+	pci_channel_io_normal = (__force pci_channel_state_t) 1,
+
+	/* I/O to channel is blocked */
+	pci_channel_io_frozen = (__force pci_channel_state_t) 2,
+
+	/* PCI card is dead */
+	pci_channel_io_perm_failure = (__force pci_channel_state_t) 3,
+};
+
+typedef unsigned short __bitwise pci_bus_flags_t;
+enum pci_bus_flags {
+	PCI_BUS_FLAGS_NO_MSI = (__force pci_bus_flags_t) 1,
+};
+
+struct pci_cap_saved_state {
+	struct hlist_node next;
+	char cap_nr;
+	u32 data[0];
+};
+
+/*
+ * The pci_dev structure is used to describe PCI devices.
+ */
+struct pci_dev {
+	struct list_head global_list;	/* node in list of all PCI devices */
+	struct list_head bus_list;	/* node in per-bus list */
+	struct pci_bus	*bus;		/* bus this device is on */
+	struct pci_bus	*subordinate;	/* bus this device bridges to */
+
+	void		*sysdata;	/* hook for sys-specific extension */
+	struct proc_dir_entry *procent;	/* device entry in /proc/bus/pci */
+
+	unsigned int	devfn;		/* encoded device & function index */
+	unsigned short	vendor;
+	unsigned short	device;
+	unsigned short	subsystem_vendor;
+	unsigned short	subsystem_device;
+	unsigned int	class;		/* 3 bytes: (base,sub,prog-if) */
+	u8		hdr_type;	/* PCI header type (`multi' flag masked out) */
+	u8		rom_base_reg;	/* which config register controls the ROM */
+	u8		pin;  		/* which interrupt pin this device uses */
+
+	struct pci_driver *driver;	/* which driver has allocated this device */
+	u64		dma_mask;	/* Mask of the bits of bus address this
+					   device implements.  Normally this is
+					   0xffffffff.  You only need to change
+					   this if your device has broken DMA
+					   or supports 64-bit transfers.  */
+
+	pci_power_t     current_state;  /* Current operating state. In ACPI-speak,
+					   this is D0-D3, D0 being fully functional,
+					   and D3 being off. */
+
+	pci_channel_state_t error_state;	/* current connectivity state */
+	struct	device	dev;		/* Generic device interface */
+
+	/* device is compatible with these IDs */
+	unsigned short vendor_compatible[DEVICE_COUNT_COMPATIBLE];
+	unsigned short device_compatible[DEVICE_COUNT_COMPATIBLE];
+
+	int		cfg_size;	/* Size of configuration space */
+
+	/*
+	 * Instead of touching interrupt line and base address registers
+	 * directly, use the values stored here. They might be different!
+	 */
+	unsigned int	irq;
+	struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */
+
+	/* These fields are used by common fixups */
+	unsigned int	transparent:1;	/* Transparent PCI bridge */
+	unsigned int	multifunction:1;/* Part of multi-function device */
+	/* keep track of device state */
+	unsigned int	is_enabled:1;	/* pci_enable_device has been called */
+	unsigned int	is_busmaster:1; /* device is busmaster */
+	unsigned int	no_msi:1;	/* device may not use msi */
+	unsigned int	no_d1d2:1;   /* only allow d0 or d3 */
+	unsigned int	block_ucfg_access:1;	/* userspace config space access is blocked */
+	unsigned int	broken_parity_status:1;	/* Device generates false positive parity */
+	unsigned int 	msi_enabled:1;
+	unsigned int	msix_enabled:1;
+
+	u32		saved_config_space[16]; /* config space saved at suspend time */
+	struct hlist_head saved_cap_space;
+	struct bin_attribute *rom_attr; /* attribute descriptor for sysfs ROM entry */
+	int rom_attr_enabled;		/* has display of the rom attribute been enabled? */
+	struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */
+};
+
+#define pci_dev_g(n) list_entry(n, struct pci_dev, global_list)
+#define pci_dev_b(n) list_entry(n, struct pci_dev, bus_list)
+#define	to_pci_dev(n) container_of(n, struct pci_dev, dev)
+#define for_each_pci_dev(d) while ((d = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, d)) != NULL)
+
+static inline struct pci_cap_saved_state *pci_find_saved_cap(
+	struct pci_dev *pci_dev,char cap)
+{
+	struct pci_cap_saved_state *tmp;
+	struct hlist_node *pos;
+
+	hlist_for_each_entry(tmp, pos, &pci_dev->saved_cap_space, next) {
+		if (tmp->cap_nr == cap)
+			return tmp;
+	}
+	return NULL;
+}
+
+static inline void pci_add_saved_cap(struct pci_dev *pci_dev,
+	struct pci_cap_saved_state *new_cap)
+{
+	hlist_add_head(&new_cap->next, &pci_dev->saved_cap_space);
+}
+
+static inline void pci_remove_saved_cap(struct pci_cap_saved_state *cap)
+{
+	hlist_del(&cap->next);
+}
+
+/*
+ *  For PCI devices, the region numbers are assigned this way:
+ *
+ *	0-5	standard PCI regions
+ *	6	expansion ROM
+ *	7-10	bridges: address space assigned to buses behind the bridge
+ */
+
+#define PCI_ROM_RESOURCE	6
+#define PCI_BRIDGE_RESOURCES	7
+#define PCI_NUM_RESOURCES	11
+
+#ifndef PCI_BUS_NUM_RESOURCES
+#define PCI_BUS_NUM_RESOURCES	8
+#endif
+
+#define PCI_REGION_FLAG_MASK	0x0fU	/* These bits of resource flags tell us the PCI region flags */
+
+struct pci_bus {
+	struct list_head node;		/* node in list of buses */
+	struct pci_bus	*parent;	/* parent bus this bridge is on */
+	struct list_head children;	/* list of child buses */
+	struct list_head devices;	/* list of devices on this bus */
+	struct pci_dev	*self;		/* bridge device as seen by parent */
+	struct resource	*resource[PCI_BUS_NUM_RESOURCES];
+					/* address space routed to this bus */
+
+	struct pci_ops	*ops;		/* configuration access functions */
+	void		*sysdata;	/* hook for sys-specific extension */
+	struct proc_dir_entry *procdir;	/* directory entry in /proc/bus/pci */
+
+	unsigned char	number;		/* bus number */
+	unsigned char	primary;	/* number of primary bridge */
+	unsigned char	secondary;	/* number of secondary bridge */
+	unsigned char	subordinate;	/* max number of subordinate buses */
+
+	char		name[48];
+
+	unsigned short  bridge_ctl;	/* manage NO_ISA/FBB/et al behaviors */
+	pci_bus_flags_t bus_flags;	/* Inherited by child busses */
+	struct device		*bridge;
+	struct class_device	class_dev;
+	struct bin_attribute	*legacy_io; /* legacy I/O for this bus */
+	struct bin_attribute	*legacy_mem; /* legacy mem */
+};
+
+#define pci_bus_b(n)	list_entry(n, struct pci_bus, node)
+#define to_pci_bus(n)	container_of(n, struct pci_bus, class_dev)
+
+/*
+ * Error values that may be returned by PCI functions.
+ */
+#define PCIBIOS_SUCCESSFUL		0x00
+#define PCIBIOS_FUNC_NOT_SUPPORTED	0x81
+#define PCIBIOS_BAD_VENDOR_ID		0x83
+#define PCIBIOS_DEVICE_NOT_FOUND	0x86
+#define PCIBIOS_BAD_REGISTER_NUMBER	0x87
+#define PCIBIOS_SET_FAILED		0x88
+#define PCIBIOS_BUFFER_TOO_SMALL	0x89
+
+/* Low-level architecture-dependent routines */
+
+struct pci_ops {
+	int (*read)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val);
+	int (*write)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val);
+};
+
+struct pci_raw_ops {
+	int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn,
+		    int reg, int len, u32 *val);
+	int (*write)(unsigned int domain, unsigned int bus, unsigned int devfn,
+		     int reg, int len, u32 val);
+};
+
+extern struct pci_raw_ops *raw_pci_ops;
+
+struct pci_bus_region {
+	unsigned long start;
+	unsigned long end;
+};
+
+struct pci_dynids {
+	spinlock_t lock;            /* protects list, index */
+	struct list_head list;      /* for IDs added at runtime */
+	unsigned int use_driver_data:1; /* pci_driver->driver_data is used */
+};
+
+/* ---------------------------------------------------------------- */
+/** PCI Error Recovery System (PCI-ERS).  If a PCI device driver provides
+ *  a set of callbacks in struct pci_error_handlers, then that device driver
+ *  will be notified of PCI bus errors, and will be driven to recovery
+ *  when an error occurs.
+ */
+
+typedef unsigned int __bitwise pci_ers_result_t;
+
+enum pci_ers_result {
+	/* no result/none/not supported in device driver */
+	PCI_ERS_RESULT_NONE = (__force pci_ers_result_t) 1,
+
+	/* Device driver can recover without slot reset */
+	PCI_ERS_RESULT_CAN_RECOVER = (__force pci_ers_result_t) 2,
+
+	/* Device driver wants slot to be reset. */
+	PCI_ERS_RESULT_NEED_RESET = (__force pci_ers_result_t) 3,
+
+	/* Device has completely failed, is unrecoverable */
+	PCI_ERS_RESULT_DISCONNECT = (__force pci_ers_result_t) 4,
+
+	/* Device driver is fully recovered and operational */
+	PCI_ERS_RESULT_RECOVERED = (__force pci_ers_result_t) 5,
+};
+
+/* PCI bus error event callbacks */
+struct pci_error_handlers
+{
+	/* PCI bus error detected on this device */
+	pci_ers_result_t (*error_detected)(struct pci_dev *dev,
+	                      enum pci_channel_state error);
+
+	/* MMIO has been re-enabled, but not DMA */
+	pci_ers_result_t (*mmio_enabled)(struct pci_dev *dev);
+
+	/* PCI Express link has been reset */
+	pci_ers_result_t (*link_reset)(struct pci_dev *dev);
+
+	/* PCI slot has been reset */
+	pci_ers_result_t (*slot_reset)(struct pci_dev *dev);
+
+	/* Device driver may resume normal operations */
+	void (*resume)(struct pci_dev *dev);
+};
+
+/* ---------------------------------------------------------------- */
+
+struct module;
+struct pci_driver {
+	struct list_head node;
+	char *name;
+	const struct pci_device_id *id_table;	/* must be non-NULL for probe to be called */
+	int  (*probe)  (struct pci_dev *dev, const struct pci_device_id *id);	/* New device inserted */
+	void (*remove) (struct pci_dev *dev);	/* Device removed (NULL if not a hot-plug capable driver) */
+	int  (*suspend) (struct pci_dev *dev, pm_message_t state);	/* Device suspended */
+	int  (*suspend_late) (struct pci_dev *dev, pm_message_t state);
+	int  (*resume_early) (struct pci_dev *dev);
+	int  (*resume) (struct pci_dev *dev);	                /* Device woken up */
+	int  (*enable_wake) (struct pci_dev *dev, pci_power_t state, int enable);   /* Enable wake event */
+	void (*shutdown) (struct pci_dev *dev);
+
+	struct pci_error_handlers *err_handler;
+	struct device_driver	driver;
+	struct pci_dynids dynids;
+
+	int multithread_probe;
+};
+
+#define	to_pci_driver(drv) container_of(drv,struct pci_driver, driver)
+
+/**
+ * PCI_DEVICE - macro used to describe a specific pci device
+ * @vend: the 16 bit PCI Vendor ID
+ * @dev: the 16 bit PCI Device ID
+ *
+ * This macro is used to create a struct pci_device_id that matches a
+ * specific device.  The subvendor and subdevice fields will be set to
+ * PCI_ANY_ID.
+ */
+#define PCI_DEVICE(vend,dev) \
+	.vendor = (vend), .device = (dev), \
+	.subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID
+
+/**
+ * PCI_DEVICE_CLASS - macro used to describe a specific pci device class
+ * @dev_class: the class, subclass, prog-if triple for this device
+ * @dev_class_mask: the class mask for this device
+ *
+ * This macro is used to create a struct pci_device_id that matches a
+ * specific PCI class.  The vendor, device, subvendor, and subdevice
+ * fields will be set to PCI_ANY_ID.
+ */
+#define PCI_DEVICE_CLASS(dev_class,dev_class_mask) \
+	.class = (dev_class), .class_mask = (dev_class_mask), \
+	.vendor = PCI_ANY_ID, .device = PCI_ANY_ID, \
+	.subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID
+
+/*
+ * pci_module_init is obsolete, this stays here till we fix up all usages of it
+ * in the tree.
+ */
+#define pci_module_init	pci_register_driver
+
+/* these external functions are only available when PCI support is enabled */
+#ifdef CONFIG_PCI
+
+extern struct bus_type pci_bus_type;
+
+/* Do NOT directly access these two variables, unless you are arch specific pci
+ * code, or pci core code. */
+extern struct list_head pci_root_buses;	/* list of all known PCI buses */
+extern struct list_head pci_devices;	/* list of all devices */
+
+void pcibios_fixup_bus(struct pci_bus *);
+int __must_check pcibios_enable_device(struct pci_dev *, int mask);
+char *pcibios_setup (char *str);
+
+/* Used only when drivers/pci/setup.c is used */
+void pcibios_align_resource(void *, struct resource *, resource_size_t,
+				resource_size_t);
+void pcibios_update_irq(struct pci_dev *, int irq);
+
+/* Generic PCI functions used internally */
+
+extern struct pci_bus *pci_find_bus(int domain, int busnr);
+void pci_bus_add_devices(struct pci_bus *bus);
+struct pci_bus *pci_scan_bus_parented(struct device *parent, int bus, struct pci_ops *ops, void *sysdata);
+static inline struct pci_bus *pci_scan_bus(int bus, struct pci_ops *ops, void *sysdata)
+{
+	struct pci_bus *root_bus;
+	root_bus = pci_scan_bus_parented(NULL, bus, ops, sysdata);
+	if (root_bus)
+		pci_bus_add_devices(root_bus);
+	return root_bus;
+}
+struct pci_bus *pci_create_bus(struct device *parent, int bus, struct pci_ops *ops, void *sysdata);
+struct pci_bus * pci_add_new_bus(struct pci_bus *parent, struct pci_dev *dev, int busnr);
+int pci_scan_slot(struct pci_bus *bus, int devfn);
+struct pci_dev * pci_scan_single_device(struct pci_bus *bus, int devfn);
+void pci_device_add(struct pci_dev *dev, struct pci_bus *bus);
+unsigned int pci_scan_child_bus(struct pci_bus *bus);
+int __must_check pci_bus_add_device(struct pci_dev *dev);
+void pci_read_bridge_bases(struct pci_bus *child);
+struct resource *pci_find_parent_resource(const struct pci_dev *dev, struct resource *res);
+int pci_get_interrupt_pin(struct pci_dev *dev, struct pci_dev **bridge);
+extern struct pci_dev *pci_dev_get(struct pci_dev *dev);
+extern void pci_dev_put(struct pci_dev *dev);
+extern void pci_remove_bus(struct pci_bus *b);
+extern void pci_remove_bus_device(struct pci_dev *dev);
+extern void pci_stop_bus_device(struct pci_dev *dev);
+void pci_setup_cardbus(struct pci_bus *bus);
+extern void pci_sort_breadthfirst(void);
+
+/* Generic PCI functions exported to card drivers */
+
+struct pci_dev *pci_find_device (unsigned int vendor, unsigned int device, const struct pci_dev *from);
+struct pci_dev *pci_find_device_reverse (unsigned int vendor, unsigned int device, const struct pci_dev *from);
+struct pci_dev *pci_find_slot (unsigned int bus, unsigned int devfn);
+int pci_find_capability (struct pci_dev *dev, int cap);
+int pci_find_next_capability (struct pci_dev *dev, u8 pos, int cap);
+int pci_find_ext_capability (struct pci_dev *dev, int cap);
+struct pci_bus *pci_find_next_bus(const struct pci_bus *from);
+
+struct pci_dev *pci_get_device(unsigned int vendor, unsigned int device,
+				struct pci_dev *from);
+struct pci_dev *pci_get_device_reverse(unsigned int vendor, unsigned int device,
+				struct pci_dev *from);
+
+struct pci_dev *pci_get_subsys (unsigned int vendor, unsigned int device,
+				unsigned int ss_vendor, unsigned int ss_device,
+				struct pci_dev *from);
+struct pci_dev *pci_get_slot (struct pci_bus *bus, unsigned int devfn);
+struct pci_dev *pci_get_bus_and_slot (unsigned int bus, unsigned int devfn);
+struct pci_dev *pci_get_class (unsigned int class, struct pci_dev *from);
+int pci_dev_present(const struct pci_device_id *ids);
+
+int pci_bus_read_config_byte (struct pci_bus *bus, unsigned int devfn, int where, u8 *val);
+int pci_bus_read_config_word (struct pci_bus *bus, unsigned int devfn, int where, u16 *val);
+int pci_bus_read_config_dword (struct pci_bus *bus, unsigned int devfn, int where, u32 *val);
+int pci_bus_write_config_byte (struct pci_bus *bus, unsigned int devfn, int where, u8 val);
+int pci_bus_write_config_word (struct pci_bus *bus, unsigned int devfn, int where, u16 val);
+int pci_bus_write_config_dword (struct pci_bus *bus, unsigned int devfn, int where, u32 val);
+
+static inline int pci_read_config_byte(struct pci_dev *dev, int where, u8 *val)
+{
+	return pci_bus_read_config_byte (dev->bus, dev->devfn, where, val);
+}
+static inline int pci_read_config_word(struct pci_dev *dev, int where, u16 *val)
+{
+	return pci_bus_read_config_word (dev->bus, dev->devfn, where, val);
+}
+static inline int pci_read_config_dword(struct pci_dev *dev, int where, u32 *val)
+{
+	return pci_bus_read_config_dword (dev->bus, dev->devfn, where, val);
+}
+static inline int pci_write_config_byte(struct pci_dev *dev, int where, u8 val)
+{
+	return pci_bus_write_config_byte (dev->bus, dev->devfn, where, val);
+}
+static inline int pci_write_config_word(struct pci_dev *dev, int where, u16 val)
+{
+	return pci_bus_write_config_word (dev->bus, dev->devfn, where, val);
+}
+static inline int pci_write_config_dword(struct pci_dev *dev, int where, u32 val)
+{
+	return pci_bus_write_config_dword (dev->bus, dev->devfn, where, val);
+}
+
+int __must_check pci_enable_device(struct pci_dev *dev);
+int __must_check pci_enable_device_bars(struct pci_dev *dev, int mask);
+void pci_disable_device(struct pci_dev *dev);
+void pci_set_master(struct pci_dev *dev);
+#define HAVE_PCI_SET_MWI
+int __must_check pci_set_mwi(struct pci_dev *dev);
+void pci_clear_mwi(struct pci_dev *dev);
+void pci_intx(struct pci_dev *dev, int enable);
+int pci_set_dma_mask(struct pci_dev *dev, u64 mask);
+int pci_set_consistent_dma_mask(struct pci_dev *dev, u64 mask);
+void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno);
+int __must_check pci_assign_resource(struct pci_dev *dev, int i);
+int __must_check pci_assign_resource_fixed(struct pci_dev *dev, int i);
+void pci_restore_bars(struct pci_dev *dev);
+
+/* ROM control related routines */
+void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size);
+void __iomem __must_check *pci_map_rom_copy(struct pci_dev *pdev, size_t *size);
+void pci_unmap_rom(struct pci_dev *pdev, void __iomem *rom);
+void pci_remove_rom(struct pci_dev *pdev);
+
+/* Power management related routines */
+int pci_save_state(struct pci_dev *dev);
+int pci_restore_state(struct pci_dev *dev);
+int pci_set_power_state(struct pci_dev *dev, pci_power_t state);
+pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state);
+int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable);
+
+/* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */
+void pci_bus_assign_resources(struct pci_bus *bus);
+void pci_bus_size_bridges(struct pci_bus *bus);
+int pci_claim_resource(struct pci_dev *, int);
+void pci_assign_unassigned_resources(void);
+void pdev_enable_device(struct pci_dev *);
+void pdev_sort_resources(struct pci_dev *, struct resource_list *);
+void pci_fixup_irqs(u8 (*)(struct pci_dev *, u8 *),
+		    int (*)(struct pci_dev *, u8, u8));
+#define HAVE_PCI_REQ_REGIONS	2
+int __must_check pci_request_regions(struct pci_dev *, const char *);
+void pci_release_regions(struct pci_dev *);
+int __must_check pci_request_region(struct pci_dev *, int, const char *);
+void pci_release_region(struct pci_dev *, int);
+
+/* drivers/pci/bus.c */
+int __must_check pci_bus_alloc_resource(struct pci_bus *bus,
+			struct resource *res, resource_size_t size,
+			resource_size_t align, resource_size_t min,
+			unsigned int type_mask,
+			void (*alignf)(void *, struct resource *,
+				resource_size_t, resource_size_t),
+			void *alignf_data);
+void pci_enable_bridges(struct pci_bus *bus);
+
+/* Proper probing supporting hot-pluggable devices */
+int __must_check __pci_register_driver(struct pci_driver *, struct module *);
+static inline int __must_check pci_register_driver(struct pci_driver *driver)
+{
+	return __pci_register_driver(driver, THIS_MODULE);
+}
+
+void pci_unregister_driver(struct pci_driver *);
+void pci_remove_behind_bridge(struct pci_dev *);
+struct pci_driver *pci_dev_driver(const struct pci_dev *);
+const struct pci_device_id *pci_match_device(struct pci_driver *drv, struct pci_dev *dev);
+const struct pci_device_id *pci_match_id(const struct pci_device_id *ids, struct pci_dev *dev);
+int pci_scan_bridge(struct pci_bus *bus, struct pci_dev * dev, int max, int pass);
+
+void pci_walk_bus(struct pci_bus *top, void (*cb)(struct pci_dev *, void *),
+		  void *userdata);
+int pci_cfg_space_size(struct pci_dev *dev);
+unsigned char pci_bus_max_busnr(struct pci_bus* bus);
+
+/* kmem_cache style wrapper around pci_alloc_consistent() */
+
+#include <linux/dmapool.h>
+
+#define	pci_pool dma_pool
+#define pci_pool_create(name, pdev, size, align, allocation) \
+		dma_pool_create(name, &pdev->dev, size, align, allocation)
+#define	pci_pool_destroy(pool) dma_pool_destroy(pool)
+#define	pci_pool_alloc(pool, flags, handle) dma_pool_alloc(pool, flags, handle)
+#define	pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr)
+
+enum pci_dma_burst_strategy {
+	PCI_DMA_BURST_INFINITY,	/* make bursts as large as possible,
+				   strategy_parameter is N/A */
+	PCI_DMA_BURST_BOUNDARY, /* disconnect at every strategy_parameter
+				   byte boundaries */
+	PCI_DMA_BURST_MULTIPLE, /* disconnect at some multiple of
+				   strategy_parameter byte boundaries */
+};
+
+#if defined(CONFIG_ISA) || defined(CONFIG_EISA)
+extern struct pci_dev *isa_bridge;
+#endif
+
+struct msix_entry {
+	u16 	vector;	/* kernel uses to write allocated vector */
+	u16	entry;	/* driver uses to specify entry, OS writes */
+};
+
+
+#ifndef CONFIG_PCI_MSI
+static inline void pci_scan_msi_device(struct pci_dev *dev) {}
+static inline int pci_enable_msi(struct pci_dev *dev) {return -1;}
+static inline void pci_disable_msi(struct pci_dev *dev) {}
+static inline int pci_enable_msix(struct pci_dev* dev,
+	struct msix_entry *entries, int nvec) {return -1;}
+static inline void pci_disable_msix(struct pci_dev *dev) {}
+static inline void msi_remove_pci_irq_vectors(struct pci_dev *dev) {}
+#else
+extern void pci_scan_msi_device(struct pci_dev *dev);
+extern int pci_enable_msi(struct pci_dev *dev);
+extern void pci_disable_msi(struct pci_dev *dev);
+extern int pci_enable_msix(struct pci_dev* dev,
+	struct msix_entry *entries, int nvec);
+extern void pci_disable_msix(struct pci_dev *dev);
+extern void msi_remove_pci_irq_vectors(struct pci_dev *dev);
+#endif
+
+#ifdef CONFIG_HT_IRQ
+/* The functions a driver should call */
+int  ht_create_irq(struct pci_dev *dev, int idx);
+void ht_destroy_irq(unsigned int irq);
+#endif /* CONFIG_HT_IRQ */
+
+extern void pci_block_user_cfg_access(struct pci_dev *dev);
+extern void pci_unblock_user_cfg_access(struct pci_dev *dev);
+
+/*
+ * PCI domain support.  Sometimes called PCI segment (eg by ACPI),
+ * a PCI domain is defined to be a set of PCI busses which share
+ * configuration space.
+ */
+#ifndef CONFIG_PCI_DOMAINS
+static inline int pci_domain_nr(struct pci_bus *bus) { return 0; }
+static inline int pci_proc_domain(struct pci_bus *bus)
+{
+	return 0;
+}
+#endif
+
+#else /* CONFIG_PCI is not enabled */
+
+/*
+ *  If the system does not have PCI, clearly these return errors.  Define
+ *  these as simple inline functions to avoid hair in drivers.
+ */
+
+#define _PCI_NOP(o,s,t) \
+	static inline int pci_##o##_config_##s (struct pci_dev *dev, int where, t val) \
+		{ return PCIBIOS_FUNC_NOT_SUPPORTED; }
+#define _PCI_NOP_ALL(o,x)	_PCI_NOP(o,byte,u8 x) \
+				_PCI_NOP(o,word,u16 x) \
+				_PCI_NOP(o,dword,u32 x)
+_PCI_NOP_ALL(read, *)
+_PCI_NOP_ALL(write,)
+
+static inline struct pci_dev *pci_find_device(unsigned int vendor, unsigned int device, const struct pci_dev *from)
+{ return NULL; }
+
+static inline struct pci_dev *pci_find_slot(unsigned int bus, unsigned int devfn)
+{ return NULL; }
+
+static inline struct pci_dev *pci_get_device(unsigned int vendor,
+				unsigned int device, struct pci_dev *from)
+{ return NULL; }
+
+static inline struct pci_dev *pci_get_device_reverse(unsigned int vendor,
+				unsigned int device, struct pci_dev *from)
+{ return NULL; }
+
+static inline struct pci_dev *pci_get_subsys (unsigned int vendor, unsigned int device,
+unsigned int ss_vendor, unsigned int ss_device, struct pci_dev *from)
+{ return NULL; }
+
+static inline struct pci_dev *pci_get_class(unsigned int class, struct pci_dev *from)
+{ return NULL; }
+
+#define pci_dev_present(ids)	(0)
+#define pci_dev_put(dev)	do { } while (0)
+
+static inline void pci_set_master(struct pci_dev *dev) { }
+static inline int pci_enable_device(struct pci_dev *dev) { return -EIO; }
+static inline void pci_disable_device(struct pci_dev *dev) { }
+static inline int pci_set_dma_mask(struct pci_dev *dev, u64 mask) { return -EIO; }
+static inline int pci_assign_resource(struct pci_dev *dev, int i) { return -EBUSY;}
+static inline int __pci_register_driver(struct pci_driver *drv, struct module *owner) { return 0;}
+static inline int pci_register_driver(struct pci_driver *drv) { return 0;}
+static inline void pci_unregister_driver(struct pci_driver *drv) { }
+static inline int pci_find_capability (struct pci_dev *dev, int cap) {return 0; }
+static inline int pci_find_next_capability (struct pci_dev *dev, u8 post, int cap) { return 0; }
+static inline int pci_find_ext_capability (struct pci_dev *dev, int cap) {return 0; }
+static inline const struct pci_device_id *pci_match_device(const struct pci_device_id *ids, const struct pci_dev *dev) { return NULL; }
+
+/* Power management related routines */
+static inline int pci_save_state(struct pci_dev *dev) { return 0; }
+static inline int pci_restore_state(struct pci_dev *dev) { return 0; }
+static inline int pci_set_power_state(struct pci_dev *dev, pci_power_t state) { return 0; }
+static inline pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state) { return PCI_D0; }
+static inline int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable) { return 0; }
+
+#define	isa_bridge	((struct pci_dev *)NULL)
+
+#define pci_dma_burst_advice(pdev, strat, strategy_parameter) do { } while (0)
+
+static inline void pci_block_user_cfg_access(struct pci_dev *dev) { }
+static inline void pci_unblock_user_cfg_access(struct pci_dev *dev) { }
+
+#endif /* CONFIG_PCI */
+
+/* Include architecture-dependent settings and functions */
+
+#include <asm/pci.h>
+
+/* these helpers provide future and backwards compatibility
+ * for accessing popular PCI BAR info */
+#define pci_resource_start(dev,bar)   ((dev)->resource[(bar)].start)
+#define pci_resource_end(dev,bar)     ((dev)->resource[(bar)].end)
+#define pci_resource_flags(dev,bar)   ((dev)->resource[(bar)].flags)
+#define pci_resource_len(dev,bar) \
+	((pci_resource_start((dev),(bar)) == 0 &&	\
+	  pci_resource_end((dev),(bar)) ==		\
+	  pci_resource_start((dev),(bar))) ? 0 :	\
+	  						\
+	 (pci_resource_end((dev),(bar)) -		\
+	  pci_resource_start((dev),(bar)) + 1))
+
+/* Similar to the helpers above, these manipulate per-pci_dev
+ * driver-specific data.  They are really just a wrapper around
+ * the generic device structure functions of these calls.
+ */
+static inline void *pci_get_drvdata (struct pci_dev *pdev)
+{
+	return dev_get_drvdata(&pdev->dev);
+}
+
+static inline void pci_set_drvdata (struct pci_dev *pdev, void *data)
+{
+	dev_set_drvdata(&pdev->dev, data);
+}
+
+/* If you want to know what to call your pci_dev, ask this function.
+ * Again, it's a wrapper around the generic device.
+ */
+static inline char *pci_name(struct pci_dev *pdev)
+{
+	return pdev->dev.bus_id;
+}
+
+
+/* Some archs don't want to expose struct resource to userland as-is
+ * in sysfs and /proc
+ */
+#ifndef HAVE_ARCH_PCI_RESOURCE_TO_USER
+static inline void pci_resource_to_user(const struct pci_dev *dev, int bar,
+                const struct resource *rsrc, resource_size_t *start,
+		resource_size_t *end)
+{
+	*start = rsrc->start;
+	*end = rsrc->end;
+}
+#endif /* HAVE_ARCH_PCI_RESOURCE_TO_USER */
+
+
+/*
+ *  The world is not perfect and supplies us with broken PCI devices.
+ *  For at least a part of these bugs we need a work-around, so both
+ *  generic (drivers/pci/quirks.c) and per-architecture code can define
+ *  fixup hooks to be called for particular buggy devices.
+ */
+
+struct pci_fixup {
+	u16 vendor, device;	/* You can use PCI_ANY_ID here of course */
+	void (*hook)(struct pci_dev *dev);
+};
+
+enum pci_fixup_pass {
+	pci_fixup_early,	/* Before probing BARs */
+	pci_fixup_header,	/* After reading configuration header */
+	pci_fixup_final,	/* Final phase of device fixups */
+	pci_fixup_enable,	/* pci_enable_device() time */
+};
+
+/* Anonymous variables would be nice... */
+#define DECLARE_PCI_FIXUP_SECTION(section, name, vendor, device, hook)	\
+	static const struct pci_fixup __pci_fixup_##name __attribute_used__ \
+	__attribute__((__section__(#section))) = { vendor, device, hook };
+#define DECLARE_PCI_FIXUP_EARLY(vendor, device, hook)			\
+	DECLARE_PCI_FIXUP_SECTION(.pci_fixup_early,			\
+			vendor##device##hook, vendor, device, hook)
+#define DECLARE_PCI_FIXUP_HEADER(vendor, device, hook)			\
+	DECLARE_PCI_FIXUP_SECTION(.pci_fixup_header,			\
+			vendor##device##hook, vendor, device, hook)
+#define DECLARE_PCI_FIXUP_FINAL(vendor, device, hook)			\
+	DECLARE_PCI_FIXUP_SECTION(.pci_fixup_final,			\
+			vendor##device##hook, vendor, device, hook)
+#define DECLARE_PCI_FIXUP_ENABLE(vendor, device, hook)			\
+	DECLARE_PCI_FIXUP_SECTION(.pci_fixup_enable,			\
+			vendor##device##hook, vendor, device, hook)
+
+
+void pci_fixup_device(enum pci_fixup_pass pass, struct pci_dev *dev);
+
+extern int pci_pci_problems;
+#define PCIPCI_FAIL		1	/* No PCI PCI DMA */
+#define PCIPCI_TRITON		2
+#define PCIPCI_NATOMA		4
+#define PCIPCI_VIAETBF		8
+#define PCIPCI_VSFX		16
+#define PCIPCI_ALIMAGIK		32	/* Need low latency setting */
+#define PCIAGP_FAIL		64	/* No PCI to AGP DMA */
+
+#endif /* __KERNEL__ */
+#endif /* LINUX_PCI_H */
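The header added above is essentially the stock Linux PCI API brought into this tree: the config-space accessor wrappers, pci_register_driver(), and the resource/drvdata helpers. As a reminder of how those declarations fit together, here is a minimal, hypothetical Linux-style driver skeleton. It is not part of the patch; the demo_* names and the 0x1234/0x5678 IDs are made up, and the extra includes plus PCI_COMMAND come from the usual companion headers rather than from this hunk.

/* Hypothetical driver skeleton exercising the interfaces declared above. */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/pci.h>

static struct pci_device_id demo_ids[] = {
        { PCI_DEVICE(0x1234, 0x5678) },         /* made-up vendor/device */
        { 0, }
};

static int demo_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
        u16 cmd;
        int rc;

        rc = pci_enable_device(pdev);           /* __must_check, so check it */
        if (rc)
                return rc;

        pci_read_config_word(pdev, PCI_COMMAND, &cmd); /* wraps pci_bus_read_config_word() */
        printk(KERN_INFO "demo %s: BAR0 %#lx/%#lx, command %#x\n",
               pci_name(pdev),
               (unsigned long)pci_resource_start(pdev, 0),
               (unsigned long)pci_resource_len(pdev, 0), cmd);

        pci_set_master(pdev);
        pci_set_drvdata(pdev, NULL);
        return 0;
}

static void demo_remove(struct pci_dev *pdev)
{
        pci_disable_device(pdev);
}

static struct pci_driver demo_driver = {
        .name     = "demo",
        .id_table = demo_ids,
        .probe    = demo_probe,
        .remove   = demo_remove,
};

static int __init demo_init(void)
{
        /* pci_register_driver() is the inline wrapper passing THIS_MODULE */
        return pci_register_driver(&demo_driver);
}

static void __exit demo_exit(void)
{
        pci_unregister_driver(&demo_driver);
}

module_init(demo_init);
module_exit(demo_exit);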
diff -Naurp xen/include/asm-ia64/linux-xen/linux/pci.h xen-redhat/include/asm-ia64/linux-xen/linux/pci.h
--- xen/include/asm-ia64/linux-xen/linux/pci.h
+++ xen-redhat/include/asm-ia64/linux-xen/linux/pci.h
@@ -1,820 +0,0 @@
-/*
- *	pci.h
- *
- *	PCI defines and function prototypes
- *	Copyright 1994, Drew Eckhardt
- *	Copyright 1997--1999 Martin Mares <mj@ucw.cz>
- *
- *	For more information, please consult the following manuals (look at
- *	http://www.pcisig.com/ for how to get them):
- *
- *	PCI BIOS Specification
- *	PCI Local Bus Specification
- *	PCI to PCI Bridge Specification
- *	PCI System Design Guide
- */
-
-#ifndef LINUX_PCI_H
-#define LINUX_PCI_H
-
-/* Include the pci register defines */
-#include <linux/pci_regs.h>
-
-/* Include the ID list */
-#include <linux/pci_ids.h>
-#ifdef XEN
-#include <asm/processor.h>
-#endif
-
-/*
- * The PCI interface treats multi-function devices as independent
- * devices.  The slot/function address of each device is encoded
- * in a single byte as follows:
- *
- *	7:3 = slot
- *	2:0 = function
- */
-#define PCI_DEVFN(slot,func)	((((slot) & 0x1f) << 3) | ((func) & 0x07))
-#define PCI_SLOT(devfn)		(((devfn) >> 3) & 0x1f)
-#define PCI_FUNC(devfn)		((devfn) & 0x07)
-
-/* Ioctls for /proc/bus/pci/X/Y nodes. */
-#define PCIIOC_BASE		('P' << 24 | 'C' << 16 | 'I' << 8)
-#define PCIIOC_CONTROLLER	(PCIIOC_BASE | 0x00)	/* Get controller for PCI device. */
-#define PCIIOC_MMAP_IS_IO	(PCIIOC_BASE | 0x01)	/* Set mmap state to I/O space. */
-#define PCIIOC_MMAP_IS_MEM	(PCIIOC_BASE | 0x02)	/* Set mmap state to MEM space. */
-#define PCIIOC_WRITE_COMBINE	(PCIIOC_BASE | 0x03)	/* Enable/disable write-combining. */
-
-#ifdef __KERNEL__
-
-#include <linux/mod_devicetable.h>
-
-#include <linux/types.h>
-#include <linux/ioport.h>
-#include <linux/list.h>
-#include <linux/compiler.h>
-#include <linux/errno.h>
-#include <linux/device.h>
-
-/* File state for mmap()s on /proc/bus/pci/X/Y */
-enum pci_mmap_state {
-	pci_mmap_io,
-	pci_mmap_mem
-};
-
-/* This defines the direction arg to the DMA mapping routines. */
-#define PCI_DMA_BIDIRECTIONAL	0
-#define PCI_DMA_TODEVICE	1
-#define PCI_DMA_FROMDEVICE	2
-#define PCI_DMA_NONE		3
-
-#define DEVICE_COUNT_COMPATIBLE	4
-#define DEVICE_COUNT_RESOURCE	12
-
-typedef int __bitwise pci_power_t;
-
-#define PCI_D0		((pci_power_t __force) 0)
-#define PCI_D1		((pci_power_t __force) 1)
-#define PCI_D2		((pci_power_t __force) 2)
-#define PCI_D3hot	((pci_power_t __force) 3)
-#define PCI_D3cold	((pci_power_t __force) 4)
-#define PCI_UNKNOWN	((pci_power_t __force) 5)
-#define PCI_POWER_ERROR	((pci_power_t __force) -1)
-
-/** The pci_channel state describes connectivity between the CPU and
- *  the pci device.  If some PCI bus between here and the pci device
- *  has crashed or locked up, this info is reflected here.
- */
-typedef unsigned int __bitwise pci_channel_state_t;
-
-enum pci_channel_state {
-	/* I/O channel is in normal state */
-	pci_channel_io_normal = (__force pci_channel_state_t) 1,
-
-	/* I/O to channel is blocked */
-	pci_channel_io_frozen = (__force pci_channel_state_t) 2,
-
-	/* PCI card is dead */
-	pci_channel_io_perm_failure = (__force pci_channel_state_t) 3,
-};
-
-typedef unsigned short __bitwise pci_bus_flags_t;
-enum pci_bus_flags {
-	PCI_BUS_FLAGS_NO_MSI = (__force pci_bus_flags_t) 1,
-};
-
-struct pci_cap_saved_state {
-	struct hlist_node next;
-	char cap_nr;
-	u32 data[0];
-};
-
-/*
- * The pci_dev structure is used to describe PCI devices.
- */
-struct pci_dev {
-	struct list_head global_list;	/* node in list of all PCI devices */
-	struct list_head bus_list;	/* node in per-bus list */
-	struct pci_bus	*bus;		/* bus this device is on */
-	struct pci_bus	*subordinate;	/* bus this device bridges to */
-
-	void		*sysdata;	/* hook for sys-specific extension */
-	struct proc_dir_entry *procent;	/* device entry in /proc/bus/pci */
-
-	unsigned int	devfn;		/* encoded device & function index */
-	unsigned short	vendor;
-	unsigned short	device;
-	unsigned short	subsystem_vendor;
-	unsigned short	subsystem_device;
-	unsigned int	class;		/* 3 bytes: (base,sub,prog-if) */
-	u8		hdr_type;	/* PCI header type (`multi' flag masked out) */
-	u8		rom_base_reg;	/* which config register controls the ROM */
-	u8		pin;  		/* which interrupt pin this device uses */
-
-	struct pci_driver *driver;	/* which driver has allocated this device */
-	u64		dma_mask;	/* Mask of the bits of bus address this
-					   device implements.  Normally this is
-					   0xffffffff.  You only need to change
-					   this if your device has broken DMA
-					   or supports 64-bit transfers.  */
-
-	pci_power_t     current_state;  /* Current operating state. In ACPI-speak,
-					   this is D0-D3, D0 being fully functional,
-					   and D3 being off. */
-
-	pci_channel_state_t error_state;	/* current connectivity state */
-	struct	device	dev;		/* Generic device interface */
-
-	/* device is compatible with these IDs */
-	unsigned short vendor_compatible[DEVICE_COUNT_COMPATIBLE];
-	unsigned short device_compatible[DEVICE_COUNT_COMPATIBLE];
-
-	int		cfg_size;	/* Size of configuration space */
-
-	/*
-	 * Instead of touching interrupt line and base address registers
-	 * directly, use the values stored here. They might be different!
-	 */
-	unsigned int	irq;
-	struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */
-
-	/* These fields are used by common fixups */
-	unsigned int	transparent:1;	/* Transparent PCI bridge */
-	unsigned int	multifunction:1;/* Part of multi-function device */
-	/* keep track of device state */
-	unsigned int	is_enabled:1;	/* pci_enable_device has been called */
-	unsigned int	is_busmaster:1; /* device is busmaster */
-	unsigned int	no_msi:1;	/* device may not use msi */
-	unsigned int	no_d1d2:1;   /* only allow d0 or d3 */
-	unsigned int	block_ucfg_access:1;	/* userspace config space access is blocked */
-	unsigned int	broken_parity_status:1;	/* Device generates false positive parity */
-	unsigned int 	msi_enabled:1;
-	unsigned int	msix_enabled:1;
-
-	u32		saved_config_space[16]; /* config space saved at suspend time */
-	struct hlist_head saved_cap_space;
-	struct bin_attribute *rom_attr; /* attribute descriptor for sysfs ROM entry */
-	int rom_attr_enabled;		/* has display of the rom attribute been enabled? */
-	struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */
-};
-
-#define pci_dev_g(n) list_entry(n, struct pci_dev, global_list)
-#define pci_dev_b(n) list_entry(n, struct pci_dev, bus_list)
-#define	to_pci_dev(n) container_of(n, struct pci_dev, dev)
-#define for_each_pci_dev(d) while ((d = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, d)) != NULL)
-
-static inline struct pci_cap_saved_state *pci_find_saved_cap(
-	struct pci_dev *pci_dev,char cap)
-{
-	struct pci_cap_saved_state *tmp;
-	struct hlist_node *pos;
-
-	hlist_for_each_entry(tmp, pos, &pci_dev->saved_cap_space, next) {
-		if (tmp->cap_nr == cap)
-			return tmp;
-	}
-	return NULL;
-}
-
-static inline void pci_add_saved_cap(struct pci_dev *pci_dev,
-	struct pci_cap_saved_state *new_cap)
-{
-	hlist_add_head(&new_cap->next, &pci_dev->saved_cap_space);
-}
-
-static inline void pci_remove_saved_cap(struct pci_cap_saved_state *cap)
-{
-	hlist_del(&cap->next);
-}
-
-/*
- *  For PCI devices, the region numbers are assigned this way:
- *
- *	0-5	standard PCI regions
- *	6	expansion ROM
- *	7-10	bridges: address space assigned to buses behind the bridge
- */
-
-#define PCI_ROM_RESOURCE	6
-#define PCI_BRIDGE_RESOURCES	7
-#define PCI_NUM_RESOURCES	11
-
-#ifndef PCI_BUS_NUM_RESOURCES
-#define PCI_BUS_NUM_RESOURCES	8
-#endif
-
-#define PCI_REGION_FLAG_MASK	0x0fU	/* These bits of resource flags tell us the PCI region flags */
-
-struct pci_bus {
-	struct list_head node;		/* node in list of buses */
-	struct pci_bus	*parent;	/* parent bus this bridge is on */
-	struct list_head children;	/* list of child buses */
-	struct list_head devices;	/* list of devices on this bus */
-	struct pci_dev	*self;		/* bridge device as seen by parent */
-	struct resource	*resource[PCI_BUS_NUM_RESOURCES];
-					/* address space routed to this bus */
-
-	struct pci_ops	*ops;		/* configuration access functions */
-	void		*sysdata;	/* hook for sys-specific extension */
-	struct proc_dir_entry *procdir;	/* directory entry in /proc/bus/pci */
-
-	unsigned char	number;		/* bus number */
-	unsigned char	primary;	/* number of primary bridge */
-	unsigned char	secondary;	/* number of secondary bridge */
-	unsigned char	subordinate;	/* max number of subordinate buses */
-
-	char		name[48];
-
-	unsigned short  bridge_ctl;	/* manage NO_ISA/FBB/et al behaviors */
-	pci_bus_flags_t bus_flags;	/* Inherited by child busses */
-	struct device		*bridge;
-	struct class_device	class_dev;
-	struct bin_attribute	*legacy_io; /* legacy I/O for this bus */
-	struct bin_attribute	*legacy_mem; /* legacy mem */
-};
-
-#define pci_bus_b(n)	list_entry(n, struct pci_bus, node)
-#define to_pci_bus(n)	container_of(n, struct pci_bus, class_dev)
-
-/*
- * Error values that may be returned by PCI functions.
- */
-#define PCIBIOS_SUCCESSFUL		0x00
-#define PCIBIOS_FUNC_NOT_SUPPORTED	0x81
-#define PCIBIOS_BAD_VENDOR_ID		0x83
-#define PCIBIOS_DEVICE_NOT_FOUND	0x86
-#define PCIBIOS_BAD_REGISTER_NUMBER	0x87
-#define PCIBIOS_SET_FAILED		0x88
-#define PCIBIOS_BUFFER_TOO_SMALL	0x89
-
-/* Low-level architecture-dependent routines */
-
-struct pci_ops {
-	int (*read)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val);
-	int (*write)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val);
-};
-
-struct pci_raw_ops {
-	int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn,
-		    int reg, int len, u32 *val);
-	int (*write)(unsigned int domain, unsigned int bus, unsigned int devfn,
-		     int reg, int len, u32 val);
-};
-
-extern struct pci_raw_ops *raw_pci_ops;
-
-struct pci_bus_region {
-	unsigned long start;
-	unsigned long end;
-};
-
-struct pci_dynids {
-	spinlock_t lock;            /* protects list, index */
-	struct list_head list;      /* for IDs added at runtime */
-	unsigned int use_driver_data:1; /* pci_driver->driver_data is used */
-};
-
-/* ---------------------------------------------------------------- */
-/** PCI Error Recovery System (PCI-ERS).  If a PCI device driver provides
- *  a set of callbacks in struct pci_error_handlers, then that device driver
- *  will be notified of PCI bus errors, and will be driven to recovery
- *  when an error occurs.
- */
-
-typedef unsigned int __bitwise pci_ers_result_t;
-
-enum pci_ers_result {
-	/* no result/none/not supported in device driver */
-	PCI_ERS_RESULT_NONE = (__force pci_ers_result_t) 1,
-
-	/* Device driver can recover without slot reset */
-	PCI_ERS_RESULT_CAN_RECOVER = (__force pci_ers_result_t) 2,
-
-	/* Device driver wants slot to be reset. */
-	PCI_ERS_RESULT_NEED_RESET = (__force pci_ers_result_t) 3,
-
-	/* Device has completely failed, is unrecoverable */
-	PCI_ERS_RESULT_DISCONNECT = (__force pci_ers_result_t) 4,
-
-	/* Device driver is fully recovered and operational */
-	PCI_ERS_RESULT_RECOVERED = (__force pci_ers_result_t) 5,
-};
-
-/* PCI bus error event callbacks */
-struct pci_error_handlers
-{
-	/* PCI bus error detected on this device */
-	pci_ers_result_t (*error_detected)(struct pci_dev *dev,
-	                      enum pci_channel_state error);
-
-	/* MMIO has been re-enabled, but not DMA */
-	pci_ers_result_t (*mmio_enabled)(struct pci_dev *dev);
-
-	/* PCI Express link has been reset */
-	pci_ers_result_t (*link_reset)(struct pci_dev *dev);
-
-	/* PCI slot has been reset */
-	pci_ers_result_t (*slot_reset)(struct pci_dev *dev);
-
-	/* Device driver may resume normal operations */
-	void (*resume)(struct pci_dev *dev);
-};
-
-/* ---------------------------------------------------------------- */
-
-struct module;
-struct pci_driver {
-	struct list_head node;
-	char *name;
-	const struct pci_device_id *id_table;	/* must be non-NULL for probe to be called */
-	int  (*probe)  (struct pci_dev *dev, const struct pci_device_id *id);	/* New device inserted */
-	void (*remove) (struct pci_dev *dev);	/* Device removed (NULL if not a hot-plug capable driver) */
-	int  (*suspend) (struct pci_dev *dev, pm_message_t state);	/* Device suspended */
-	int  (*suspend_late) (struct pci_dev *dev, pm_message_t state);
-	int  (*resume_early) (struct pci_dev *dev);
-	int  (*resume) (struct pci_dev *dev);	                /* Device woken up */
-	int  (*enable_wake) (struct pci_dev *dev, pci_power_t state, int enable);   /* Enable wake event */
-	void (*shutdown) (struct pci_dev *dev);
-
-	struct pci_error_handlers *err_handler;
-	struct device_driver	driver;
-	struct pci_dynids dynids;
-
-	int multithread_probe;
-};
-
-#define	to_pci_driver(drv) container_of(drv,struct pci_driver, driver)
-
-/**
- * PCI_DEVICE - macro used to describe a specific pci device
- * @vend: the 16 bit PCI Vendor ID
- * @dev: the 16 bit PCI Device ID
- *
- * This macro is used to create a struct pci_device_id that matches a
- * specific device.  The subvendor and subdevice fields will be set to
- * PCI_ANY_ID.
- */
-#define PCI_DEVICE(vend,dev) \
-	.vendor = (vend), .device = (dev), \
-	.subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID
-
-/**
- * PCI_DEVICE_CLASS - macro used to describe a specific pci device class
- * @dev_class: the class, subclass, prog-if triple for this device
- * @dev_class_mask: the class mask for this device
- *
- * This macro is used to create a struct pci_device_id that matches a
- * specific PCI class.  The vendor, device, subvendor, and subdevice
- * fields will be set to PCI_ANY_ID.
- */
-#define PCI_DEVICE_CLASS(dev_class,dev_class_mask) \
-	.class = (dev_class), .class_mask = (dev_class_mask), \
-	.vendor = PCI_ANY_ID, .device = PCI_ANY_ID, \
-	.subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID
-
-/*
- * pci_module_init is obsolete, this stays here till we fix up all usages of it
- * in the tree.
- */
-#define pci_module_init	pci_register_driver
-
-/* these external functions are only available when PCI support is enabled */
-#ifdef CONFIG_PCI
-
-extern struct bus_type pci_bus_type;
-
-/* Do NOT directly access these two variables, unless you are arch specific pci
- * code, or pci core code. */
-extern struct list_head pci_root_buses;	/* list of all known PCI buses */
-extern struct list_head pci_devices;	/* list of all devices */
-
-void pcibios_fixup_bus(struct pci_bus *);
-int __must_check pcibios_enable_device(struct pci_dev *, int mask);
-char *pcibios_setup (char *str);
-
-/* Used only when drivers/pci/setup.c is used */
-void pcibios_align_resource(void *, struct resource *, resource_size_t,
-				resource_size_t);
-void pcibios_update_irq(struct pci_dev *, int irq);
-
-/* Generic PCI functions used internally */
-
-extern struct pci_bus *pci_find_bus(int domain, int busnr);
-void pci_bus_add_devices(struct pci_bus *bus);
-struct pci_bus *pci_scan_bus_parented(struct device *parent, int bus, struct pci_ops *ops, void *sysdata);
-static inline struct pci_bus *pci_scan_bus(int bus, struct pci_ops *ops, void *sysdata)
-{
-	struct pci_bus *root_bus;
-	root_bus = pci_scan_bus_parented(NULL, bus, ops, sysdata);
-	if (root_bus)
-		pci_bus_add_devices(root_bus);
-	return root_bus;
-}
-struct pci_bus *pci_create_bus(struct device *parent, int bus, struct pci_ops *ops, void *sysdata);
-struct pci_bus * pci_add_new_bus(struct pci_bus *parent, struct pci_dev *dev, int busnr);
-int pci_scan_slot(struct pci_bus *bus, int devfn);
-struct pci_dev * pci_scan_single_device(struct pci_bus *bus, int devfn);
-void pci_device_add(struct pci_dev *dev, struct pci_bus *bus);
-unsigned int pci_scan_child_bus(struct pci_bus *bus);
-int __must_check pci_bus_add_device(struct pci_dev *dev);
-void pci_read_bridge_bases(struct pci_bus *child);
-struct resource *pci_find_parent_resource(const struct pci_dev *dev, struct resource *res);
-int pci_get_interrupt_pin(struct pci_dev *dev, struct pci_dev **bridge);
-extern struct pci_dev *pci_dev_get(struct pci_dev *dev);
-extern void pci_dev_put(struct pci_dev *dev);
-extern void pci_remove_bus(struct pci_bus *b);
-extern void pci_remove_bus_device(struct pci_dev *dev);
-extern void pci_stop_bus_device(struct pci_dev *dev);
-void pci_setup_cardbus(struct pci_bus *bus);
-extern void pci_sort_breadthfirst(void);
-
-/* Generic PCI functions exported to card drivers */
-
-struct pci_dev *pci_find_device (unsigned int vendor, unsigned int device, const struct pci_dev *from);
-struct pci_dev *pci_find_device_reverse (unsigned int vendor, unsigned int device, const struct pci_dev *from);
-struct pci_dev *pci_find_slot (unsigned int bus, unsigned int devfn);
-int pci_find_capability (struct pci_dev *dev, int cap);
-int pci_find_next_capability (struct pci_dev *dev, u8 pos, int cap);
-int pci_find_ext_capability (struct pci_dev *dev, int cap);
-struct pci_bus *pci_find_next_bus(const struct pci_bus *from);
-
-struct pci_dev *pci_get_device(unsigned int vendor, unsigned int device,
-				struct pci_dev *from);
-struct pci_dev *pci_get_device_reverse(unsigned int vendor, unsigned int device,
-				struct pci_dev *from);
-
-struct pci_dev *pci_get_subsys (unsigned int vendor, unsigned int device,
-				unsigned int ss_vendor, unsigned int ss_device,
-				struct pci_dev *from);
-struct pci_dev *pci_get_slot (struct pci_bus *bus, unsigned int devfn);
-struct pci_dev *pci_get_bus_and_slot (unsigned int bus, unsigned int devfn);
-struct pci_dev *pci_get_class (unsigned int class, struct pci_dev *from);
-int pci_dev_present(const struct pci_device_id *ids);
-
-int pci_bus_read_config_byte (struct pci_bus *bus, unsigned int devfn, int where, u8 *val);
-int pci_bus_read_config_word (struct pci_bus *bus, unsigned int devfn, int where, u16 *val);
-int pci_bus_read_config_dword (struct pci_bus *bus, unsigned int devfn, int where, u32 *val);
-int pci_bus_write_config_byte (struct pci_bus *bus, unsigned int devfn, int where, u8 val);
-int pci_bus_write_config_word (struct pci_bus *bus, unsigned int devfn, int where, u16 val);
-int pci_bus_write_config_dword (struct pci_bus *bus, unsigned int devfn, int where, u32 val);
-
-static inline int pci_read_config_byte(struct pci_dev *dev, int where, u8 *val)
-{
-	return pci_bus_read_config_byte (dev->bus, dev->devfn, where, val);
-}
-static inline int pci_read_config_word(struct pci_dev *dev, int where, u16 *val)
-{
-	return pci_bus_read_config_word (dev->bus, dev->devfn, where, val);
-}
-static inline int pci_read_config_dword(struct pci_dev *dev, int where, u32 *val)
-{
-	return pci_bus_read_config_dword (dev->bus, dev->devfn, where, val);
-}
-static inline int pci_write_config_byte(struct pci_dev *dev, int where, u8 val)
-{
-	return pci_bus_write_config_byte (dev->bus, dev->devfn, where, val);
-}
-static inline int pci_write_config_word(struct pci_dev *dev, int where, u16 val)
-{
-	return pci_bus_write_config_word (dev->bus, dev->devfn, where, val);
-}
-static inline int pci_write_config_dword(struct pci_dev *dev, int where, u32 val)
-{
-	return pci_bus_write_config_dword (dev->bus, dev->devfn, where, val);
-}
-
-int __must_check pci_enable_device(struct pci_dev *dev);
-int __must_check pci_enable_device_bars(struct pci_dev *dev, int mask);
-void pci_disable_device(struct pci_dev *dev);
-void pci_set_master(struct pci_dev *dev);
-#define HAVE_PCI_SET_MWI
-int __must_check pci_set_mwi(struct pci_dev *dev);
-void pci_clear_mwi(struct pci_dev *dev);
-void pci_intx(struct pci_dev *dev, int enable);
-int pci_set_dma_mask(struct pci_dev *dev, u64 mask);
-int pci_set_consistent_dma_mask(struct pci_dev *dev, u64 mask);
-void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno);
-int __must_check pci_assign_resource(struct pci_dev *dev, int i);
-int __must_check pci_assign_resource_fixed(struct pci_dev *dev, int i);
-void pci_restore_bars(struct pci_dev *dev);
-
-/* ROM control related routines */
-void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size);
-void __iomem __must_check *pci_map_rom_copy(struct pci_dev *pdev, size_t *size);
-void pci_unmap_rom(struct pci_dev *pdev, void __iomem *rom);
-void pci_remove_rom(struct pci_dev *pdev);
-
-/* Power management related routines */
-int pci_save_state(struct pci_dev *dev);
-int pci_restore_state(struct pci_dev *dev);
-int pci_set_power_state(struct pci_dev *dev, pci_power_t state);
-pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state);
-int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable);
-
-/* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */
-void pci_bus_assign_resources(struct pci_bus *bus);
-void pci_bus_size_bridges(struct pci_bus *bus);
-int pci_claim_resource(struct pci_dev *, int);
-void pci_assign_unassigned_resources(void);
-void pdev_enable_device(struct pci_dev *);
-void pdev_sort_resources(struct pci_dev *, struct resource_list *);
-void pci_fixup_irqs(u8 (*)(struct pci_dev *, u8 *),
-		    int (*)(struct pci_dev *, u8, u8));
-#define HAVE_PCI_REQ_REGIONS	2
-int __must_check pci_request_regions(struct pci_dev *, const char *);
-void pci_release_regions(struct pci_dev *);
-int __must_check pci_request_region(struct pci_dev *, int, const char *);
-void pci_release_region(struct pci_dev *, int);
-
-/* drivers/pci/bus.c */
-int __must_check pci_bus_alloc_resource(struct pci_bus *bus,
-			struct resource *res, resource_size_t size,
-			resource_size_t align, resource_size_t min,
-			unsigned int type_mask,
-			void (*alignf)(void *, struct resource *,
-				resource_size_t, resource_size_t),
-			void *alignf_data);
-void pci_enable_bridges(struct pci_bus *bus);
-
-/* Proper probing supporting hot-pluggable devices */
-int __must_check __pci_register_driver(struct pci_driver *, struct module *);
-static inline int __must_check pci_register_driver(struct pci_driver *driver)
-{
-	return __pci_register_driver(driver, THIS_MODULE);
-}
-
-void pci_unregister_driver(struct pci_driver *);
-void pci_remove_behind_bridge(struct pci_dev *);
-struct pci_driver *pci_dev_driver(const struct pci_dev *);
-const struct pci_device_id *pci_match_device(struct pci_driver *drv, struct pci_dev *dev);
-const struct pci_device_id *pci_match_id(const struct pci_device_id *ids, struct pci_dev *dev);
-int pci_scan_bridge(struct pci_bus *bus, struct pci_dev * dev, int max, int pass);
-
-void pci_walk_bus(struct pci_bus *top, void (*cb)(struct pci_dev *, void *),
-		  void *userdata);
-int pci_cfg_space_size(struct pci_dev *dev);
-unsigned char pci_bus_max_busnr(struct pci_bus* bus);
-
-/* kmem_cache style wrapper around pci_alloc_consistent() */
-
-#include <linux/dmapool.h>
-
-#define	pci_pool dma_pool
-#define pci_pool_create(name, pdev, size, align, allocation) \
-		dma_pool_create(name, &pdev->dev, size, align, allocation)
-#define	pci_pool_destroy(pool) dma_pool_destroy(pool)
-#define	pci_pool_alloc(pool, flags, handle) dma_pool_alloc(pool, flags, handle)
-#define	pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr)
-
-enum pci_dma_burst_strategy {
-	PCI_DMA_BURST_INFINITY,	/* make bursts as large as possible,
-				   strategy_parameter is N/A */
-	PCI_DMA_BURST_BOUNDARY, /* disconnect at every strategy_parameter
-				   byte boundaries */
-	PCI_DMA_BURST_MULTIPLE, /* disconnect at some multiple of
-				   strategy_parameter byte boundaries */
-};
-
-#if defined(CONFIG_ISA) || defined(CONFIG_EISA)
-extern struct pci_dev *isa_bridge;
-#endif
-
-struct msix_entry {
-	u16 	vector;	/* kernel uses to write allocated vector */
-	u16	entry;	/* driver uses to specify entry, OS writes */
-};
-
-
-#ifndef CONFIG_PCI_MSI
-static inline void pci_scan_msi_device(struct pci_dev *dev) {}
-static inline int pci_enable_msi(struct pci_dev *dev) {return -1;}
-static inline void pci_disable_msi(struct pci_dev *dev) {}
-static inline int pci_enable_msix(struct pci_dev* dev,
-	struct msix_entry *entries, int nvec) {return -1;}
-static inline void pci_disable_msix(struct pci_dev *dev) {}
-static inline void msi_remove_pci_irq_vectors(struct pci_dev *dev) {}
-#else
-extern void pci_scan_msi_device(struct pci_dev *dev);
-extern int pci_enable_msi(struct pci_dev *dev);
-extern void pci_disable_msi(struct pci_dev *dev);
-extern int pci_enable_msix(struct pci_dev* dev,
-	struct msix_entry *entries, int nvec);
-extern void pci_disable_msix(struct pci_dev *dev);
-extern void msi_remove_pci_irq_vectors(struct pci_dev *dev);
-#endif
-
-#ifdef CONFIG_HT_IRQ
-/* The functions a driver should call */
-int  ht_create_irq(struct pci_dev *dev, int idx);
-void ht_destroy_irq(unsigned int irq);
-#endif /* CONFIG_HT_IRQ */
-
-extern void pci_block_user_cfg_access(struct pci_dev *dev);
-extern void pci_unblock_user_cfg_access(struct pci_dev *dev);
-
-/*
- * PCI domain support.  Sometimes called PCI segment (eg by ACPI),
- * a PCI domain is defined to be a set of PCI busses which share
- * configuration space.
- */
-#ifndef CONFIG_PCI_DOMAINS
-static inline int pci_domain_nr(struct pci_bus *bus) { return 0; }
-static inline int pci_proc_domain(struct pci_bus *bus)
-{
-	return 0;
-}
-#endif
-
-#else /* CONFIG_PCI is not enabled */
-
-/*
- *  If the system does not have PCI, clearly these return errors.  Define
- *  these as simple inline functions to avoid hair in drivers.
- */
-
-#define _PCI_NOP(o,s,t) \
-	static inline int pci_##o##_config_##s (struct pci_dev *dev, int where, t val) \
-		{ return PCIBIOS_FUNC_NOT_SUPPORTED; }
-#define _PCI_NOP_ALL(o,x)	_PCI_NOP(o,byte,u8 x) \
-				_PCI_NOP(o,word,u16 x) \
-				_PCI_NOP(o,dword,u32 x)
-_PCI_NOP_ALL(read, *)
-_PCI_NOP_ALL(write,)
-
-static inline struct pci_dev *pci_find_device(unsigned int vendor, unsigned int device, const struct pci_dev *from)
-{ return NULL; }
-
-static inline struct pci_dev *pci_find_slot(unsigned int bus, unsigned int devfn)
-{ return NULL; }
-
-static inline struct pci_dev *pci_get_device(unsigned int vendor,
-				unsigned int device, struct pci_dev *from)
-{ return NULL; }
-
-static inline struct pci_dev *pci_get_device_reverse(unsigned int vendor,
-				unsigned int device, struct pci_dev *from)
-{ return NULL; }
-
-static inline struct pci_dev *pci_get_subsys (unsigned int vendor, unsigned int device,
-unsigned int ss_vendor, unsigned int ss_device, struct pci_dev *from)
-{ return NULL; }
-
-static inline struct pci_dev *pci_get_class(unsigned int class, struct pci_dev *from)
-{ return NULL; }
-
-#define pci_dev_present(ids)	(0)
-#define pci_dev_put(dev)	do { } while (0)
-
-static inline void pci_set_master(struct pci_dev *dev) { }
-static inline int pci_enable_device(struct pci_dev *dev) { return -EIO; }
-static inline void pci_disable_device(struct pci_dev *dev) { }
-static inline int pci_set_dma_mask(struct pci_dev *dev, u64 mask) { return -EIO; }
-static inline int pci_assign_resource(struct pci_dev *dev, int i) { return -EBUSY;}
-static inline int __pci_register_driver(struct pci_driver *drv, struct module *owner) { return 0;}
-static inline int pci_register_driver(struct pci_driver *drv) { return 0;}
-static inline void pci_unregister_driver(struct pci_driver *drv) { }
-static inline int pci_find_capability (struct pci_dev *dev, int cap) {return 0; }
-static inline int pci_find_next_capability (struct pci_dev *dev, u8 post, int cap) { return 0; }
-static inline int pci_find_ext_capability (struct pci_dev *dev, int cap) {return 0; }
-static inline const struct pci_device_id *pci_match_device(const struct pci_device_id *ids, const struct pci_dev *dev) { return NULL; }
-
-/* Power management related routines */
-static inline int pci_save_state(struct pci_dev *dev) { return 0; }
-static inline int pci_restore_state(struct pci_dev *dev) { return 0; }
-static inline int pci_set_power_state(struct pci_dev *dev, pci_power_t state) { return 0; }
-static inline pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state) { return PCI_D0; }
-static inline int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable) { return 0; }
-
-#define	isa_bridge	((struct pci_dev *)NULL)
-
-#define pci_dma_burst_advice(pdev, strat, strategy_parameter) do { } while (0)
-
-static inline void pci_block_user_cfg_access(struct pci_dev *dev) { }
-static inline void pci_unblock_user_cfg_access(struct pci_dev *dev) { }
-
-#endif /* CONFIG_PCI */
-
-/* Include architecture-dependent settings and functions */
-
-#include <asm/pci.h>
-
-/* these helpers provide future and backwards compatibility
- * for accessing popular PCI BAR info */
-#define pci_resource_start(dev,bar)   ((dev)->resource[(bar)].start)
-#define pci_resource_end(dev,bar)     ((dev)->resource[(bar)].end)
-#define pci_resource_flags(dev,bar)   ((dev)->resource[(bar)].flags)
-#define pci_resource_len(dev,bar) \
-	((pci_resource_start((dev),(bar)) == 0 &&	\
-	  pci_resource_end((dev),(bar)) ==		\
-	  pci_resource_start((dev),(bar))) ? 0 :	\
-	  						\
-	 (pci_resource_end((dev),(bar)) -		\
-	  pci_resource_start((dev),(bar)) + 1))
-
-/* Similar to the helpers above, these manipulate per-pci_dev
- * driver-specific data.  They are really just a wrapper around
- * the generic device structure functions of these calls.
- */
-static inline void *pci_get_drvdata (struct pci_dev *pdev)
-{
-	return dev_get_drvdata(&pdev->dev);
-}
-
-static inline void pci_set_drvdata (struct pci_dev *pdev, void *data)
-{
-	dev_set_drvdata(&pdev->dev, data);
-}
-
-/* If you want to know what to call your pci_dev, ask this function.
- * Again, it's a wrapper around the generic device.
- */
-static inline char *pci_name(struct pci_dev *pdev)
-{
-	return pdev->dev.bus_id;
-}
-
-
-/* Some archs don't want to expose struct resource to userland as-is
- * in sysfs and /proc
- */
-#ifndef HAVE_ARCH_PCI_RESOURCE_TO_USER
-static inline void pci_resource_to_user(const struct pci_dev *dev, int bar,
-                const struct resource *rsrc, resource_size_t *start,
-		resource_size_t *end)
-{
-	*start = rsrc->start;
-	*end = rsrc->end;
-}
-#endif /* HAVE_ARCH_PCI_RESOURCE_TO_USER */
-
-
-/*
- *  The world is not perfect and supplies us with broken PCI devices.
- *  For at least a part of these bugs we need a work-around, so both
- *  generic (drivers/pci/quirks.c) and per-architecture code can define
- *  fixup hooks to be called for particular buggy devices.
- */
-
-struct pci_fixup {
-	u16 vendor, device;	/* You can use PCI_ANY_ID here of course */
-	void (*hook)(struct pci_dev *dev);
-};
-
-enum pci_fixup_pass {
-	pci_fixup_early,	/* Before probing BARs */
-	pci_fixup_header,	/* After reading configuration header */
-	pci_fixup_final,	/* Final phase of device fixups */
-	pci_fixup_enable,	/* pci_enable_device() time */
-};
-
-/* Anonymous variables would be nice... */
-#define DECLARE_PCI_FIXUP_SECTION(section, name, vendor, device, hook)	\
-	static const struct pci_fixup __pci_fixup_##name __attribute_used__ \
-	__attribute__((__section__(#section))) = { vendor, device, hook };
-#define DECLARE_PCI_FIXUP_EARLY(vendor, device, hook)			\
-	DECLARE_PCI_FIXUP_SECTION(.pci_fixup_early,			\
-			vendor##device##hook, vendor, device, hook)
-#define DECLARE_PCI_FIXUP_HEADER(vendor, device, hook)			\
-	DECLARE_PCI_FIXUP_SECTION(.pci_fixup_header,			\
-			vendor##device##hook, vendor, device, hook)
-#define DECLARE_PCI_FIXUP_FINAL(vendor, device, hook)			\
-	DECLARE_PCI_FIXUP_SECTION(.pci_fixup_final,			\
-			vendor##device##hook, vendor, device, hook)
-#define DECLARE_PCI_FIXUP_ENABLE(vendor, device, hook)			\
-	DECLARE_PCI_FIXUP_SECTION(.pci_fixup_enable,			\
-			vendor##device##hook, vendor, device, hook)
-
-
-void pci_fixup_device(enum pci_fixup_pass pass, struct pci_dev *dev);
-
-extern int pci_pci_problems;
-#define PCIPCI_FAIL		1	/* No PCI PCI DMA */
-#define PCIPCI_TRITON		2
-#define PCIPCI_NATOMA		4
-#define PCIPCI_VIAETBF		8
-#define PCIPCI_VSFX		16
-#define PCIPCI_ALIMAGIK		32	/* Need low latency setting */
-#define PCIAGP_FAIL		64	/* No PCI to AGP DMA */
-
-#endif /* __KERNEL__ */
-#endif /* LINUX_PCI_H */
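The file removed above is the ia64-private copy of the same Linux pci.h; an apparently identical copy is added earlier in this patch, so nothing is lost. Two pieces of it are worth a small illustration: the devfn encoding (bits 7:3 slot, bits 2:0 function) and the DECLARE_PCI_FIXUP_* quirk machinery. The demo_quirk function and the 0x1234/0x5678 IDs below are hypothetical.

/* Hypothetical quirk showing the devfn encoding and fixup registration.
 * PCI_DEVFN(3, 1) == (3 << 3) | 1 == 0x19; PCI_SLOT(0x19) == 3 and
 * PCI_FUNC(0x19) == 1 recover the two halves. */
static void demo_quirk(struct pci_dev *dev)
{
        printk(KERN_INFO "quirk for %s at slot %d function %d\n",
               pci_name(dev), PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
}
/* Runs in the pci_fixup_final pass, i.e. after the configuration header
 * has been read and the BARs probed (made-up vendor/device IDs). */
DECLARE_PCI_FIXUP_FINAL(0x1234, 0x5678, demo_quirk);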
diff -Naurp xen/include/asm-ia64/mm.h xen-redhat/include/asm-ia64/mm.h
--- xen/include/asm-ia64/mm.h
+++ xen-redhat/include/asm-ia64/mm.h
@@ -417,7 +417,7 @@ extern unsigned long totalram_pages;
 extern int nr_swap_pages;
 
 extern void alloc_dom_xen_and_dom_io(void);
-extern void mm_teardown(struct domain* d);
+extern int mm_teardown(struct domain* d);
 extern void mm_final_teardown(struct domain* d);
 extern struct page_info * assign_new_domain_page(struct domain *d, unsigned long mpaddr);
 extern void assign_new_domain0_page(struct domain *d, unsigned long mpaddr);
@@ -508,4 +508,6 @@ int steal_page(
 
 #define domain_get_maximum_gpfn(d) (-ENOSYS)
 
+extern struct domain *dom_xen, *dom_io;	/* for vmcoreinfo */
+
 #endif /* __ASM_IA64_MM_H__ */
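mm_teardown() now returns int instead of void. The contract is not spelled out in this hunk; the obvious reading is that a non-zero return means the teardown could not complete in one go and the caller has to react, most likely by retrying later. A hedged sketch of such a caller follows; the demo_* name is made up and the ordering relative to mm_final_teardown() is inferred from the names only.

/* Hypothetical caller adjusted to the new int return of mm_teardown().
 * Assumption (not stated in this hunk): non-zero means "not finished,
 * retry later", so the value must be propagated. */
static int demo_relinquish_mm(struct domain *d)
{
        int rc = mm_teardown(d);

        if (rc)
                return rc;              /* let the caller reschedule the teardown */

        mm_final_teardown(d);           /* assumed: only after a full success */
        return 0;
}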
diff -Naurp xen/include/asm-ia64/regionreg.h xen-redhat/include/asm-ia64/regionreg.h
--- xen/include/asm-ia64/regionreg.h
+++ xen-redhat/include/asm-ia64/regionreg.h
@@ -76,7 +76,8 @@ extern int deallocate_rid_range(struct d
 struct vcpu;
 extern void init_all_rr(struct vcpu *v);
 
-extern int set_metaphysical_rr0(void);
+extern void set_virtual_rr0(void);
+extern void set_metaphysical_rr0(void);
 
 extern void load_region_regs(struct vcpu *v);
 
diff -Naurp xen/include/asm-ia64/shadow.h xen-redhat/include/asm-ia64/shadow.h
--- xen/include/asm-ia64/shadow.h
+++ xen-redhat/include/asm-ia64/shadow.h
@@ -40,8 +40,8 @@
  * Utilities to change relationship of gpfn->mfn for designated domain,
  * which is required by gnttab transfer, balloon, device model and etc.
  */
-void guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn);
-void guest_physmap_remove_page(struct domain *d, unsigned long gpfn, unsigned long mfn);
+int guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn, int order);
+void guest_physmap_remove_page(struct domain *d, unsigned long gpfn, unsigned long mfn, int order);
 
 static inline int
 shadow_mode_enabled(struct domain *d)
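guest_physmap_add_page()/guest_physmap_remove_page() gain an order argument, and the add variant now returns int. A hedged call-site sketch follows; the assumptions (order is the log2 of the contiguous frame count, so 0 means a single frame, and a non-zero return from the add means failure) come from the signatures rather than from anything stated in this hunk, and the demo_* names are made up.

/* Hypothetical call site updated for the new signatures above. */
static int demo_map_single_page(struct domain *d, unsigned long gpfn,
                                unsigned long mfn)
{
        /* order 0: one frame; assumed non-zero return == failure */
        if (guest_physmap_add_page(d, gpfn, mfn, 0) != 0)
                return -1;
        return 0;
}

static void demo_unmap_single_page(struct domain *d, unsigned long gpfn,
                                   unsigned long mfn)
{
        guest_physmap_remove_page(d, gpfn, mfn, 0);     /* order 0 again */
}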
diff -Naurp xen/include/asm-ia64/vmmu.h xen-redhat/include/asm-ia64/vmmu.h
--- xen/include/asm-ia64/vmmu.h
+++ xen-redhat/include/asm-ia64/vmmu.h
@@ -24,12 +24,8 @@
 #define XEN_TLBthash_H
 
 #define     MAX_CCN_DEPTH       (15)       // collision chain depth
-#define     VCPU_VTLB_SHIFT     (20)    // 1M for VTLB
-#define     VCPU_VTLB_SIZE      (1UL<<VCPU_VTLB_SHIFT)
-#define     VCPU_VTLB_ORDER     (VCPU_VTLB_SHIFT - PAGE_SHIFT)
-#define     VCPU_VHPT_SHIFT     (24)    // 16M for VTLB
-#define     VCPU_VHPT_SIZE      (1UL<<VCPU_VHPT_SHIFT)
-#define     VCPU_VHPT_ORDER     (VCPU_VHPT_SHIFT - PAGE_SHIFT)
+#define     DEFAULT_VTLB_SZ     (14) // 16K hash + 16K c-chain for VTLB
+#define     DEFAULT_VHPT_SZ     (23) // 8M hash + 8M c-chain for VHPT
 #define     VTLB(v,_x)          (v->arch.vtlb._x)
 #define     VHPT(v,_x)          (v->arch.vhpt._x)
 #ifndef __ASSEMBLY__
@@ -195,15 +191,17 @@ typedef struct thash_cb {
     u64     hash_sz;        // size of above data.
     void    *cch_buf;       // base address of collision chain.
     u64     cch_sz;         // size of above data.
+    u64     cch_free_idx;   // index of free entry.
     thash_data_t *cch_freelist;
-    thash_data_t *cch_rec_head;  // cch recycle header
     PTA     pta;
 } thash_cb_t;
 
 /*
- * Initialize internal control data before service.
+ * Allocate and initialize internal control data before service.
  */
-extern void thash_init(thash_cb_t *hcb, u64 sz);
+extern int thash_alloc(thash_cb_t *hcb, u64 sz, char *what);
+
+extern void thash_free(thash_cb_t *hcb);
 
 /*
  * Insert an entry to hash table. 
@@ -279,6 +277,7 @@ extern void thash_purge_and_insert(struc
  *
  */
 extern void thash_purge_all(struct vcpu *v);
+extern void vmx_vcpu_flush_vtlb_all(struct vcpu *v);
 
 /*
  * Lookup the hash table and its collision chain to find an entry
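thash_init() becomes an allocating thash_alloc() plus a matching thash_free(), and the fixed VCPU_VTLB_*/VCPU_VHPT_* sizes give way to the DEFAULT_VTLB_SZ/DEFAULT_VHPT_SZ shifts. Below is a hedged sketch of a vcpu setup/teardown path using the new interface. The v->arch.vtlb and v->arch.vhpt fields are taken from the VTLB()/VHPT() macros above; the meaning of the size argument (assumed: a power-of-two shift), the role of the "what" string (assumed: a diagnostic label), and the demo_* names are assumptions.

/* Hypothetical vcpu setup/teardown using thash_alloc()/thash_free(). */
static int demo_vtlb_setup(struct vcpu *v)
{
        int rc;

        rc = thash_alloc(&v->arch.vtlb, DEFAULT_VTLB_SZ, "vtlb");
        if (rc)
                return rc;                      /* assumed: non-zero == allocation failed */

        rc = thash_alloc(&v->arch.vhpt, DEFAULT_VHPT_SZ, "vhpt");
        if (rc) {
                thash_free(&v->arch.vtlb);      /* undo the first allocation */
                return rc;
        }
        return 0;
}

static void demo_vtlb_teardown(struct vcpu *v)
{
        thash_free(&v->arch.vhpt);
        thash_free(&v->arch.vtlb);
}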
diff -Naurp xen/include/asm-ia64/vmx.h xen-redhat/include/asm-ia64/vmx.h
--- xen/include/asm-ia64/vmx.h
+++ xen-redhat/include/asm-ia64/vmx.h
@@ -22,6 +22,8 @@
 #ifndef _ASM_IA64_VT_H
 #define _ASM_IA64_VT_H
 
+#include <asm/ia64_int.h>
+
 #include <public/hvm/ioreq.h>
 #define vmx_user_mode(regs) (((struct ia64_psr *)&(regs)->cr_ipsr)->vm == 1)
 
@@ -36,7 +38,7 @@ extern void vmx_load_state(struct vcpu *
 extern void vmx_setup_platform(struct domain *d);
 extern void vmx_do_launch(struct vcpu *v);
 extern void vmx_io_assist(struct vcpu *v);
-extern int ia64_hypercall (struct pt_regs *regs);
+extern IA64FAULT ia64_hypercall(struct pt_regs *regs);
 extern void vmx_save_state(struct vcpu *v);
 extern void vmx_load_state(struct vcpu *v);
 extern void show_registers(struct pt_regs *regs);
@@ -50,12 +52,15 @@ extern void set_ifa_itir_iha (struct vcp
 extern void inject_guest_interruption(struct vcpu *vcpu, u64 vec);
 extern void set_illegal_op_isr (struct vcpu *vcpu);
 extern void illegal_op (struct vcpu *vcpu);
+extern void set_rsv_reg_field_isr (struct vcpu *vcpu);
+extern void rsv_reg_field (struct vcpu *vcpu);
 extern void vmx_relinquish_guest_resources(struct domain *d);
 extern void vmx_relinquish_vcpu_resources(struct vcpu *v);
 extern void vmx_die_if_kernel(char *str, struct pt_regs *regs, long err);
 extern void vmx_send_assist_req(struct vcpu *v);
 extern void deliver_pal_init(struct vcpu *vcpu);
 extern void vmx_pend_pal_init(struct domain *d);
+extern void vmx_lazy_load_fpu(struct vcpu *vcpu);
 
 static inline vcpu_iodata_t *get_vio(struct domain *d, unsigned long cpu)
 {
diff -Naurp xen/include/asm-ia64/vmx_pal_vsa.h xen-redhat/include/asm-ia64/vmx_pal_vsa.h
--- xen/include/asm-ia64/vmx_pal_vsa.h
+++ xen-redhat/include/asm-ia64/vmx_pal_vsa.h
@@ -28,6 +28,14 @@
 #ifndef __ASSEMBLY__
 extern u64 ia64_call_vsa(u64 proc, u64 arg1, u64 arg2, u64 arg3,
                          u64 arg4, u64 arg5, u64 arg6, u64 arg7);
+
+/* entry points in assembly code for calling vps services */
+
+extern char vmx_vps_sync_read;
+extern char vmx_vps_sync_write;
+extern char vmx_vps_resume_normal;
+extern char vmx_vps_resume_handler;
+
 extern u64 __vsa_base;
 #endif  /* __ASSEMBLY__ */
 
@@ -38,6 +46,8 @@ extern u64 __vsa_base;
 #define PAL_VPS_SET_PENDING_INTERRUPT       0x1000
 #define PAL_VPS_THASH               0x1400
 #define PAL_VPS_TTAG                0x1800
+#define PAL_VPS_RESTORE             0x1c00
+#define PAL_VPS_SAVE                0x2000
 
 #endif /* _PAL_VSA_H_ */
 
diff -Naurp xen/include/asm-ia64/vmx_phy_mode.h xen-redhat/include/asm-ia64/vmx_phy_mode.h
--- xen/include/asm-ia64/vmx_phy_mode.h
+++ xen-redhat/include/asm-ia64/vmx_phy_mode.h
@@ -120,4 +120,6 @@ extern void physical_tlb_miss(VCPU *vcpu
 #define GUEST_VIRT  1   /* Guest in virtual mode */
 #define GUEST_PHYS  2   /* Guest in physical mode, requiring emulation */
 
+#define PAL_INIT_ENTRY 0x80000000ffffffa0
+
 #endif /* _PHY_MODE_H_ */
diff -Naurp xen/include/asm-ia64/vmx_platform.h xen-redhat/include/asm-ia64/vmx_platform.h
--- xen/include/asm-ia64/vmx_platform.h
+++ xen-redhat/include/asm-ia64/vmx_platform.h
@@ -22,6 +22,7 @@
 #include <public/xen.h>
 #include <public/hvm/params.h>
 #include <asm/viosapic.h>
+#include <asm/hvm/vacpi.h>
 struct mmio_list;
 typedef struct virtual_platform_def {
     unsigned long       buffered_io_va;
@@ -33,6 +34,7 @@ typedef struct virtual_platform_def {
     struct mmio_list    *mmio;
     /* One IOSAPIC now... */
     struct viosapic     viosapic;
+    struct vacpi        vacpi;
 } vir_plat_t;
 
 static inline int __fls(uint32_t word)
diff -Naurp xen/include/asm-ia64/vmx_vcpu.h xen-redhat/include/asm-ia64/vmx_vcpu.h
--- xen/include/asm-ia64/vmx_vcpu.h
+++ xen-redhat/include/asm-ia64/vmx_vcpu.h
@@ -331,34 +331,22 @@ static inline IA64FAULT vmx_vcpu_get_cpu
 
 static inline IA64FAULT vmx_vcpu_set_dbr(VCPU * vcpu, u64 reg, u64 val)
 {
-	// TODO: unimplemented DBRs return a reserved register fault
-	// TODO: Should set Logical CPU state, not just physical
-	ia64_set_dbr(reg, val);
-	return IA64_NO_FAULT;
+        return vcpu_set_dbr(vcpu, reg, val);
 }
 
 static inline IA64FAULT vmx_vcpu_set_ibr(VCPU * vcpu, u64 reg, u64 val)
 {
-	// TODO: unimplemented IBRs return a reserved register fault
-	// TODO: Should set Logical CPU state, not just physical
-	ia64_set_ibr(reg, val);
-	return IA64_NO_FAULT;
+        return vcpu_set_ibr(vcpu, reg, val);
 }
 
 static inline IA64FAULT vmx_vcpu_get_dbr(VCPU * vcpu, u64 reg, u64 * pval)
 {
-	// TODO: unimplemented DBRs return a reserved register fault
-	u64 val = ia64_get_dbr(reg);
-	*pval = val;
-	return IA64_NO_FAULT;
+        return vcpu_get_dbr(vcpu, reg, pval);
 }
 
 static inline IA64FAULT vmx_vcpu_get_ibr(VCPU * vcpu, u64 reg, u64 * pval)
 {
-	// TODO: unimplemented IBRs return a reserved register fault
-	u64 val = ia64_get_ibr(reg);
-	*pval = val;
-	return IA64_NO_FAULT;
+        return vcpu_get_ibr(vcpu, reg, pval);
 }
 
 /**************************************************************************
diff -Naurp xen/include/asm-ia64/xenprocessor.h xen-redhat/include/asm-ia64/xenprocessor.h
--- xen/include/asm-ia64/xenprocessor.h
+++ xen-redhat/include/asm-ia64/xenprocessor.h
@@ -237,6 +237,10 @@ typedef union {
     u64 itir;
 } ia64_itir_t;
 
-#define dump_execution_state() printk("FIXME: implement ia64 dump_execution_state()\n");
+#define dump_execution_state()						\
+    do {								\
+        printk("FIXME: implement ia64 dump_execution_state()\n");	\
+        dump_stack();							\
+    } while (0)
 
 #endif // _ASM_IA64_XENPROCESSOR_H
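Besides adding dump_stack(), the new dump_execution_state() wraps its body in do { ... } while (0). That wrapper is what keeps a multi-statement macro usable as a single statement: the old definition, a printk() with a trailing semicolon baked into the macro, already broke an if/else like the hypothetical fragment below, and an unwrapped two-statement body would be worse.

/* Hypothetical helper: compiles only because the macro now expands to a
 * single statement.  With the old "printk(...);" definition the extra
 * ';' detached the else branch and this would not build. */
static void demo_report(int verbose)
{
        if (verbose)
                dump_execution_state();
        else
                printk("demo: staying quiet\n");
}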
diff -Naurp xen/include/asm-x86/acpi.h xen-redhat/include/asm-x86/acpi.h
--- xen/include/asm-x86/acpi.h
+++ xen-redhat/include/asm-x86/acpi.h
@@ -178,4 +178,6 @@ extern void acpi_reserve_bootmem(void);
 extern u8 x86_acpiid_to_apicid[];
 #define MAX_LOCAL_APIC 256
 
+extern int acpi_dmar_init(void);
+
 #endif /*_ASM_ACPI_H*/
diff -Naurp xen/include/asm-x86/amd-iommu.h xen-redhat/include/asm-x86/amd-iommu.h
--- xen/include/asm-x86/amd-iommu.h
+++ xen-redhat/include/asm-x86/amd-iommu.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2007 Advanced Micro Devices, Inc.
+ * Author: Leo Duran <leo.duran@amd.com>
+ * Author: Wei Wang <wei.wang2@amd.com> - adapted to xen
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+#ifndef _ASM_X86_64_AMD_IOMMU_H
+#define _ASM_X86_64_AMD_IOMMU_H
+
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/list.h>
+#include <xen/spinlock.h>
+#include <asm/hvm/svm/amd-iommu-defs.h>
+
+#define iommu_found()           (!list_empty(&amd_iommu_head))
+
+extern struct list_head amd_iommu_head;
+
+extern int __init amd_iov_detect(void);
+
+struct table_struct {
+    void *buffer;
+    unsigned long entries;
+    unsigned long alloc_size;
+};
+
+struct amd_iommu {
+    struct list_head list;
+    spinlock_t lock; /* protect iommu */
+
+    u16 bdf;
+    u8  cap_offset;
+    u8  revision;
+    u8  unit_id;
+    u8  msi_number;
+
+    u8 pte_not_present_cached;
+    u8 ht_tunnel_support;
+    u8 iotlb_support;
+
+    u8 isochronous;
+    u8 coherent;
+    u8 res_pass_pw;
+    u8 pass_pw;
+    u8 ht_tunnel_enable;
+
+    void *mmio_base;
+    unsigned long mmio_base_phys;
+
+    struct table_struct dev_table;
+    struct table_struct cmd_buffer;
+    u32 cmd_buffer_tail;
+    struct table_struct event_log;
+    u32 event_log_head;
+
+    int exclusion_enable;
+    int exclusion_allow_all;
+    uint64_t exclusion_base;
+    uint64_t exclusion_limit;
+
+    int msi_cap;
+    int maskbit;
+
+    int enabled;
+    int vector;
+};
+
+struct ivrs_mappings {
+    u16 dte_requestor_id;
+    u8 dte_sys_mgt_enable;
+    u8 dte_allow_exclusion;
+    u8 unity_map_enable;
+    u8 write_permission;
+    u8 read_permission;
+    unsigned long addr_range_start;
+    unsigned long addr_range_length;
+    struct amd_iommu *iommu;
+
+    /* per device interrupt remapping table */
+    void *intremap_table;
+    spinlock_t intremap_lock;
+
+    /* interrupt remapping settings */
+    u8 dte_lint1_pass;
+    u8 dte_lint0_pass;
+    u8 dte_nmi_pass;
+    u8 dte_ext_int_pass;
+    u8 dte_init_pass;
+};
+#endif /* _ASM_X86_64_AMD_IOMMU_H */
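Note: the amd_iommu_head list exported by this new header is meant to be walked with Xen's intrusive list helpers. A hedged in-hypervisor sketch (the helper name count_amd_iommus is hypothetical, and it assumes amd_iov_detect() has already populated the list):

    #include <xen/list.h>
    #include <asm/amd-iommu.h>

    /* Sketch: count the IOMMUs that detection linked into amd_iommu_head. */
    static unsigned int count_amd_iommus(void)
    {
        struct amd_iommu *iommu;
        unsigned int n = 0;

        if ( !iommu_found() )   /* i.e. list_empty(&amd_iommu_head) */
            return 0;

        list_for_each_entry ( iommu, &amd_iommu_head, list )
            n++;

        return n;
    }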
diff -Naurp xen/include/asm-x86/apic.h xen-redhat/include/asm-x86/apic.h
--- xen/include/asm-x86/apic.h
+++ xen-redhat/include/asm-x86/apic.h
@@ -2,9 +2,7 @@
 #define __ASM_APIC_H
 
 #include <xen/config.h>
-#include <asm/fixmap.h>
 #include <asm/apicdef.h>
-#include <asm/processor.h>
 #include <asm/system.h>
 
 #define Dprintk(x...)
@@ -51,11 +49,7 @@ static __inline u32 apic_read(unsigned l
 	return *((volatile u32 *)(APIC_BASE+reg));
 }
 
-static __inline__ void apic_wait_icr_idle(void)
-{
-	while ( apic_read( APIC_ICR ) & APIC_ICR_BUSY )
-		cpu_relax();
-}
+void apic_wait_icr_idle(void);
 
 int get_physical_broadcast(void);
 
diff -Naurp xen/include/asm-x86/config.h xen-redhat/include/asm-x86/config.h
--- xen/include/asm-x86/config.h
+++ xen-redhat/include/asm-x86/config.h
@@ -382,4 +382,6 @@ extern unsigned long xen_phys_start, xen
 #define ELFSIZE 32
 #endif
 
+#define ARCH_CRASH_SAVE_VMCOREINFO
+
 #endif /* __X86_CONFIG_H__ */
diff -Naurp xen/include/asm-x86/cpufeature.h xen-redhat/include/asm-x86/cpufeature.h
--- xen/include/asm-x86/cpufeature.h
+++ xen-redhat/include/asm-x86/cpufeature.h
@@ -31,7 +31,7 @@
 #define X86_FEATURE_PSE36	(0*32+17) /* 36-bit PSEs */
 #define X86_FEATURE_PN		(0*32+18) /* Processor serial number */
 #define X86_FEATURE_CLFLSH	(0*32+19) /* Supports the CLFLUSH instruction */
-#define X86_FEATURE_DTES	(0*32+21) /* Debug Trace Store */
+#define X86_FEATURE_DS		(0*32+21) /* Debug Store */
 #define X86_FEATURE_ACPI	(0*32+22) /* ACPI via MSR */
 #define X86_FEATURE_MMX		(0*32+23) /* Multimedia Extensions */
 #define X86_FEATURE_FXSR	(0*32+24) /* FXSAVE and FXRSTOR instructions (fast save and restore) */
@@ -42,6 +42,7 @@
 #define X86_FEATURE_HT		(0*32+28) /* Hyper-Threading */
 #define X86_FEATURE_ACC		(0*32+29) /* Automatic clock control */
 #define X86_FEATURE_IA64	(0*32+30) /* IA-64 processor */
+#define X86_FEATURE_PBE		(0*32+31) /* Pending Break Enable */
 
 /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
 /* Don't duplicate feature flags which are redundant with Intel! */
@@ -49,6 +50,8 @@
 #define X86_FEATURE_MP		(1*32+19) /* MP Capable. */
 #define X86_FEATURE_NX		(1*32+20) /* Execute Disable */
 #define X86_FEATURE_MMXEXT	(1*32+22) /* AMD MMX extensions */
+#define X86_FEATURE_FFXSR	(1*32+25) /* FFXSR instruction optimizations */
+#define X86_FEATURE_PAGE1GB	(1*32+26) /* 1Gb large page support */
 #define X86_FEATURE_RDTSCP	(1*32+27) /* RDTSCP */
 #define X86_FEATURE_LM		(1*32+29) /* Long Mode (x86-64) */
 #define X86_FEATURE_3DNOWEXT	(1*32+30) /* AMD 3DNow! extensions */
@@ -71,29 +74,65 @@
 #define X86_FEATURE_P3		(3*32+ 6) /* P3 */
 #define X86_FEATURE_P4		(3*32+ 7) /* P4 */
 #define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
+#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */
+#define X86_FEATURE_DTES64	(4*32+ 2) /* 64-bit Debug Store */
 #define X86_FEATURE_MWAIT	(4*32+ 3) /* Monitor/Mwait support */
 #define X86_FEATURE_DSCPL	(4*32+ 4) /* CPL Qualified Debug Store */
 #define X86_FEATURE_VMXE	(4*32+ 5) /* Virtual Machine Extensions */
+#define X86_FEATURE_SMXE	(4*32+ 6) /* Safer Mode Extensions */
 #define X86_FEATURE_EST		(4*32+ 7) /* Enhanced SpeedStep */
 #define X86_FEATURE_TM2		(4*32+ 8) /* Thermal Monitor 2 */
 #define X86_FEATURE_CID		(4*32+10) /* Context ID */
+#define X86_FEATURE_FMA		(4*32+12) /* Fused multiply-add */
 #define X86_FEATURE_CX16        (4*32+13) /* CMPXCHG16B */
 #define X86_FEATURE_XTPR	(4*32+14) /* Send Task Priority Messages */
+#define X86_FEATURE_PDCM	(4*32+15) /* Perf/Debug Capability MSR */
+#define X86_FEATURE_DCA		(4*32+18) /* Direct Cache Access */
+#define X86_FEATURE_X2APIC	(4*32+21) /* x2APIC */
+#define X86_FEATURE_MOVBE	(4*32+22) /* MOVBE instruction */
+#define X86_FEATURE_XSAVE	(4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
+#define X86_FEATURE_OSXSAVE	(4*32+27) /* "" XSAVE enabled in the OS */
+#define X86_FEATURE_AVX		(4*32+28) /* Advanced Vector Extensions */
+#define X86_FEATURE_HYPERVISOR	(4*32+31) /* Running under some hypervisor */
 
 /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
 #define X86_FEATURE_XSTORE	(5*32+ 2) /* on-CPU RNG present (xstore insn) */
 #define X86_FEATURE_XSTORE_EN	(5*32+ 3) /* on-CPU RNG enabled */
 #define X86_FEATURE_XCRYPT	(5*32+ 6) /* on-CPU crypto (xcrypt insn) */
 #define X86_FEATURE_XCRYPT_EN	(5*32+ 7) /* on-CPU crypto enabled */
+#define X86_FEATURE_ACE2	(5*32+ 8) /* Advanced Cryptography Engine v2 */
+#define X86_FEATURE_ACE2_EN	(5*32+ 9) /* ACE v2 enabled */
+#define X86_FEATURE_PHE		(5*32+ 10) /* PadLock Hash Engine */
+#define X86_FEATURE_PHE_EN	(5*32+ 11) /* PHE enabled */
+#define X86_FEATURE_PMM		(5*32+ 12) /* PadLock Montgomery Multiplier */
+#define X86_FEATURE_PMM_EN	(5*32+ 13) /* PMM enabled */
+
 
 /* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
 #define X86_FEATURE_LAHF_LM	(6*32+ 0) /* LAHF/SAHF in long mode */
 #define X86_FEATURE_CMP_LEGACY	(6*32+ 1) /* If yes HyperThreading not valid */
-#define X86_FEATURE_SVME        (6*32+ 2) /* Secure Virtual Machine */
-#define X86_FEATURE_FFXSR       (6*32+25) /* FFXSR instruction optimizations */
+#define X86_FEATURE_SVM		(6*32+ 2) /* Secure virtual machine */
+#define X86_FEATURE_EXTAPIC	(6*32+ 3) /* Extended APIC space */
+#define X86_FEATURE_CR8_LEGACY	(6*32+ 4) /* CR8 in 32-bit mode */
+#define X86_FEATURE_ABM		(6*32+ 5) /* Advanced bit manipulation */
+#define X86_FEATURE_SSE4A	(6*32+ 6) /* SSE-4A */
+#define X86_FEATURE_MISALIGNSSE	(6*32+ 7) /* Misaligned SSE mode */
+#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */
+#define X86_FEATURE_OSVW	(6*32+ 9) /* OS Visible Workaround */
+#define X86_FEATURE_IBS		(6*32+10) /* Instruction Based Sampling */
+#define X86_FEATURE_XOP		(6*32+11) /* extended AVX instructions */
+#define X86_FEATURE_SKINIT	(6*32+12) /* SKINIT/STGI instructions */
+#define X86_FEATURE_WDT		(6*32+13) /* Watchdog timer */
+#define X86_FEATURE_LWP		(6*32+15) /* Light Weight Profiling */
+#define X86_FEATURE_FMA4	(6*32+16) /* 4 operands MAC instructions */
+#define X86_FEATURE_NODEID_MSR	(6*32+19) /* NodeId MSR */
+#define X86_FEATURE_TBM		(6*32+21) /* trailing bit manipulations */
+#define X86_FEATURE_TOPOEXT	(6*32+22) /* topology extensions CPUID leafs */
+#define X86_FEATURE_PERFCTR_CORE (6*32+ 23) /* core perf counter extensions */
+#define X86_FEATURE_PERFCTR_NB	(6*32+ 24) /* NB perf counter extensions */
 
 #define cpu_has(c, bit)		test_bit(bit, (c)->x86_capability)
 #define boot_cpu_has(bit)	test_bit(bit, boot_cpu_data.x86_capability)
@@ -105,6 +144,7 @@
 #define cpu_has_tsc		boot_cpu_has(X86_FEATURE_TSC)
 #define cpu_has_pae		boot_cpu_has(X86_FEATURE_PAE)
 #define cpu_has_pge		boot_cpu_has(X86_FEATURE_PGE)
+#define cpu_has_pat		boot_cpu_has(X86_FEATURE_PAT)
 #define cpu_has_apic		boot_cpu_has(X86_FEATURE_APIC)
 #define cpu_has_sep		boot_cpu_has(X86_FEATURE_SEP)
 #define cpu_has_mtrr		boot_cpu_has(X86_FEATURE_MTRR)
@@ -121,6 +161,8 @@
 #define cpu_has_cyrix_arr	boot_cpu_has(X86_FEATURE_CYRIX_ARR)
 #define cpu_has_centaur_mcr	boot_cpu_has(X86_FEATURE_CENTAUR_MCR)
 #define cpu_has_clflush		boot_cpu_has(X86_FEATURE_CLFLSH)
+#define cpu_has_page1gb		0
+#define cpu_has_efer            (boot_cpu_data.x86_capability[1] & 0x20100800)
 #else /* __x86_64__ */
 #define cpu_has_vme		0
 #define cpu_has_de		1
@@ -128,6 +170,7 @@
 #define cpu_has_tsc		1
 #define cpu_has_pae		1
 #define cpu_has_pge		1
+#define cpu_has_pat		1
 #define cpu_has_apic		boot_cpu_has(X86_FEATURE_APIC)
 #define cpu_has_sep		0
 #define cpu_has_mtrr		1
@@ -144,8 +187,12 @@
 #define cpu_has_cyrix_arr	0
 #define cpu_has_centaur_mcr	0
 #define cpu_has_clflush		boot_cpu_has(X86_FEATURE_CLFLSH)
+#define cpu_has_page1gb		boot_cpu_has(X86_FEATURE_PAGE1GB)
+#define cpu_has_efer            1
 #endif
 
+#define cpu_has_arch_perfmon    boot_cpu_has(X86_FEATURE_ARCH_PERFMON)
+
 #endif /* __ASM_I386_CPUFEATURE_H */
 
 /* 
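Note: the new cpu_has_page1gb / cpu_has_arch_perfmon shorthands follow the existing boot_cpu_has() pattern. A small sketch of how a caller is expected to use them (function name hypothetical, Xen context assumed):

    #include <xen/lib.h>
    #include <asm/cpufeature.h>

    /* Sketch: cpu_has_page1gb constant-folds to 0 on 32-bit builds and
     * tests CPUID-derived state in boot_cpu_data on x86-64. */
    static void report_features(void)
    {
        if ( cpu_has_page1gb )
            printk("1GB superpages supported\n");
        if ( boot_cpu_has(X86_FEATURE_ARCH_PERFMON) )  /* == cpu_has_arch_perfmon */
            printk("architectural perfmon available\n");
    }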
diff -Naurp xen/include/asm-x86/debugger.h xen-redhat/include/asm-x86/debugger.h
--- xen/include/asm-x86/debugger.h
+++ xen-redhat/include/asm-x86/debugger.h
@@ -68,6 +68,8 @@ static inline int debugger_trap_entry(
     if ( guest_kernel_mode(v, regs) && v->domain->debugger_attached &&
          ((vector == TRAP_int3) || (vector == TRAP_debug)) )
     {
+        if ( vector != TRAP_debug ) /* domain pause is good enough */
+            current->arch.gdbsx_vcpu_event = vector;
         domain_pause_for_debugger();
         return 1;
     }
diff -Naurp xen/include/asm-x86/desc.h xen-redhat/include/asm-x86/desc.h
--- xen/include/asm-x86/desc.h
+++ xen-redhat/include/asm-x86/desc.h
@@ -34,11 +34,9 @@
 #define FLAT_COMPAT_USER_CS   FLAT_COMPAT_RING3_CS
 #define FLAT_COMPAT_USER_SS   FLAT_COMPAT_RING3_SS
 
-#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
-#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 2)
-
-#define __TSS(n) (((n)<<2) + __FIRST_TSS_ENTRY)
-#define __LDT(n) (((n)<<2) + __FIRST_LDT_ENTRY)
+#define TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
+#define LDT_ENTRY (TSS_ENTRY + 2)
+#define PER_CPU_GDT_ENTRY (LDT_ENTRY + 2)
 
 #elif defined(__i386__)
 
@@ -49,19 +47,17 @@
 #define FLAT_COMPAT_USER_DS   FLAT_USER_DS
 #define FLAT_COMPAT_USER_SS   FLAT_USER_SS
 
-#define __DOUBLEFAULT_TSS_ENTRY FIRST_RESERVED_GDT_ENTRY
-
-#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
-#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 1)
+#define DOUBLEFAULT_TSS_ENTRY FIRST_RESERVED_GDT_ENTRY
 
-#define __TSS(n) (((n)<<1) + __FIRST_TSS_ENTRY)
-#define __LDT(n) (((n)<<1) + __FIRST_LDT_ENTRY)
+#define TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
+#define LDT_ENTRY (TSS_ENTRY + 1)
+#define PER_CPU_GDT_ENTRY (LDT_ENTRY + 1)
 
 #endif
 
 #ifndef __ASSEMBLY__
 
-#define load_TR(n)  __asm__ __volatile__ ("ltr  %%ax" : : "a" (__TSS(n)<<3) )
+#define load_TR(n)  __asm__ __volatile__ ("ltr  %%ax" : : "a" (TSS_ENTRY<<3) )
 
 #if defined(__x86_64__)
 #define GUEST_KERNEL_RPL(d) (is_pv_32bit_domain(d) ? 1 : 3)
@@ -194,20 +190,25 @@ __asm__ __volatile__ ("movw %w3,0(%2)\n\
  "rorl $16,%%eax" \
  : "=m"(*(n)) : "a" (addr), "r"(n), "ir"(limit), "i"(type|0x80))
 
+DECLARE_PER_CPU(struct tss_struct *, doublefault_tss);
+
 #endif
 
-extern struct desc_struct gdt_table[];
+struct desc_ptr {
+	unsigned short limit;
+	unsigned long base;
+} __attribute__((__packed__)) ;
+
+extern struct desc_struct boot_cpu_gdt_table[];
+DECLARE_PER_CPU(struct desc_struct *, gdt_table);
 #ifdef CONFIG_COMPAT
-extern struct desc_struct compat_gdt_table[];
+extern struct desc_struct boot_cpu_compat_gdt_table[];
+DECLARE_PER_CPU(struct desc_struct *, compat_gdt_table);
 #else
-# define compat_gdt_table gdt_table
+# define boot_cpu_compat_gdt_table boot_cpu_gdt_table
+# define per_cpu__compat_gdt_table per_cpu__gdt_table
 #endif
 
-struct Xgt_desc_struct {
-    unsigned short size;
-    unsigned long address __attribute__((packed));
-};
-
 extern void set_intr_gate(unsigned int irq, void * addr);
 extern void set_system_gate(unsigned int n, void *addr);
 extern void set_task_gate(unsigned int n, unsigned int sel);
diff -Naurp xen/include/asm-x86/domain.h xen-redhat/include/asm-x86/domain.h
--- xen/include/asm-x86/domain.h
+++ xen-redhat/include/asm-x86/domain.h
@@ -16,7 +16,6 @@
 #define is_pv_32on64_domain(d) (0)
 #endif
 #define is_pv_32on64_vcpu(v)   (is_pv_32on64_domain((v)->domain))
-#define IS_COMPAT(d)           (is_pv_32on64_domain(d))
 
 struct trap_bounce {
     uint32_t      error_code;
@@ -139,6 +138,13 @@ struct p2m_domain {
     struct page_info * (*alloc_page  )(struct domain *d);
     void               (*free_page   )(struct domain *d, 
                                        struct page_info *pg);
+    int                (*set_entry   )(struct domain *d, unsigned long gfn,
+                                       mfn_t mfn, int order, u32 l1e_flags);
+    mfn_t              (*get_entry   )(struct domain *d, unsigned long gfn);
+    mfn_t              (*get_entry_fast)(unsigned long gfn);
+
+    void               (*change_entry_type_global)(struct domain *d,
+                                                    u32 l1e_flags);
 
     /* Highest guest frame that's ever been mapped in the p2m */
     unsigned long max_mapped_pfn;
@@ -212,15 +218,22 @@ struct arch_domain
 
     /* I/O-port admin-specified access capabilities. */
     struct rangeset *ioport_caps;
+    uint32_t pci_cf8;
 
     struct hvm_domain hvm_domain;
 
+    /* pass-through device list */
+    struct list_head pdev_list;
+
     struct paging_domain paging;
     struct p2m_domain p2m ;
 
     /* Shadow translated domain: P2M mapping */
     pagetable_t phys_table;
 
+    int vector_pirq[NR_VECTORS];
+    int pirq_vector[NR_IRQS];
+
     /* Pseudophysical e820 map (XENMEM_memory_map).  */
     struct e820entry e820[3];
     unsigned int nr_e820;
@@ -232,6 +245,17 @@ struct arch_domain
     bool_t is_32bit_pv;
     /* Is shared-info page in 32-bit format? */
     bool_t has_32bit_shinfo;
+
+    /* Continuable domain_relinquish_resources(). */
+    enum {
+        RELMEM_not_started,
+        RELMEM_xen,
+        RELMEM_l4,
+        RELMEM_l3,
+        RELMEM_l2,
+        RELMEM_done,
+    } relmem;
+    struct list_head relmem_list;
 } __cacheline_aligned;
 
 #ifdef CONFIG_X86_PAE
@@ -268,6 +292,9 @@ struct arch_vcpu
     void (*ctxt_switch_from) (struct vcpu *);
     void (*ctxt_switch_to) (struct vcpu *);
 
+    /* Record information required to continue execution after migration */
+    void *continue_info;
+
     /* Bounce information for propagating an exception to guest OS. */
     struct trap_bounce trap_bounce;
 
@@ -309,12 +336,27 @@ struct arch_vcpu
 
     /* Guest-specified relocation of vcpu_info. */
     unsigned long vcpu_info_mfn;
+
+    uint32_t gdbsx_vcpu_event;
 } __cacheline_aligned;
 
-/* shorthands to improve code legibility */
+/* Shorthands to improve code legibility. */
 #define hvm_vmx         hvm_vcpu.u.vmx
 #define hvm_svm         hvm_vcpu.u.svm
 
+/* Continue the current hypercall via func(data) on specified cpu. */
+int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data);
+
+/* Clean up CR4 bits that are not under guest control. */
+ unsigned long pv_guest_cr4_fixup(unsigned long guest_cr4);
+
+/* Convert between guest-visible and real CR4 values. */
+#define pv_guest_cr4_to_real_cr4(c) \
+    ((c) | (mmu_cr4_features & (X86_CR4_PGE | X86_CR4_PSE)))
+#define real_cr4_to_pv_guest_cr4(c) \
+    ((c) & ~(X86_CR4_PGE | X86_CR4_PSE))
+
+
 #endif /* __ASM_DOMAIN_H__ */
 
 /*
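Note: the CR4 conversion macros added above are pure bit manipulation, so their round-trip behaviour can be stated directly. A hedged sketch (function name hypothetical, Xen context assumed, mmu_cr4_features comes from asm/processor.h):

    #include <xen/lib.h>
    #include <asm/processor.h>
    #include <asm/domain.h>

    /* Sketch: sanitise a PV guest's CR4 write and compute the value Xen
     * actually loads.  The guest-visible value never carries PGE/PSE. */
    static unsigned long load_pv_guest_cr4(unsigned long guest_cr4)
    {
        unsigned long cleaned = pv_guest_cr4_fixup(guest_cr4);
        unsigned long real    = pv_guest_cr4_to_real_cr4(cleaned);

        ASSERT(real_cr4_to_pv_guest_cr4(real) ==
               (cleaned & ~(X86_CR4_PGE | X86_CR4_PSE)));
        return real;
    }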
diff -Naurp xen/include/asm-x86/e820.h xen-redhat/include/asm-x86/e820.h
--- xen/include/asm-x86/e820.h
+++ xen-redhat/include/asm-x86/e820.h
@@ -22,6 +22,7 @@ struct e820map {
     struct e820entry map[E820MAX];
 };
 
+extern int reserve_e820_ram(struct e820map *e820, uint64_t s, uint64_t e);
 extern unsigned long init_e820(const char *, struct e820entry *, int *);
 extern struct e820map e820;
 
diff -Naurp xen/include/asm-x86/event.h xen-redhat/include/asm-x86/event.h
--- xen/include/asm-x86/event.h
+++ xen-redhat/include/asm-x86/event.h
@@ -10,7 +10,6 @@
 #define __ASM_EVENT_H__
 
 #include <xen/shared.h>
-#include <asm/hvm/irq.h> /* cpu_has_pending_irq() */
 
 static inline void vcpu_kick(struct vcpu *v)
 {
@@ -31,7 +30,12 @@ static inline void vcpu_kick(struct vcpu
 
 static inline void vcpu_mark_events_pending(struct vcpu *v)
 {
-    if ( !test_and_set_bit(0, &vcpu_info(v, evtchn_upcall_pending)) )
+    if ( test_and_set_bit(0, &vcpu_info(v, evtchn_upcall_pending)) )
+        return;
+
+    if ( is_hvm_vcpu(v) )
+        hvm_assert_evtchn_irq(v);
+    else
         vcpu_kick(v);
 }
 
diff -Naurp xen/include/asm-x86/fixmap.h xen-redhat/include/asm-x86/fixmap.h
--- xen/include/asm-x86/fixmap.h
+++ xen-redhat/include/asm-x86/fixmap.h
@@ -17,6 +17,9 @@
 #include <asm/acpi.h>
 #include <asm/page.h>
 #include <xen/kexec.h>
+#include <xen/iommu.h>
+#include <asm/msi.h>
+#include <asm/amd-iommu.h>
 
 /*
  * Here we define all the compile-time 'special' virtual
@@ -40,6 +43,12 @@ enum fixed_addresses {
     FIX_KEXEC_BASE_0,
     FIX_KEXEC_BASE_END = FIX_KEXEC_BASE_0 \
       + ((KEXEC_XEN_NO_PAGES >> 1) * KEXEC_IMAGE_NR) - 1,
+    FIX_IOMMU_REGS_BASE_0,
+    FIX_IOMMU_REGS_END = FIX_IOMMU_REGS_BASE_0 + MAX_IOMMUS-1,
+    FIX_IOMMU_MMIO_BASE_0,
+    FIX_IOMMU_MMIO_END = FIX_IOMMU_MMIO_BASE_0 + IOMMU_PAGES-1,
+    FIX_MSIX_IO_RESERV_BASE,
+    FIX_MSIX_IO_RESERV_END = FIX_MSIX_IO_RESERV_BASE + FIX_MSIX_MAX_PAGES -1,
     __end_of_fixed_addresses
 };
 
diff -Naurp xen/include/asm-x86/flushtlb.h xen-redhat/include/asm-x86/flushtlb.h
--- xen/include/asm-x86/flushtlb.h
+++ xen-redhat/include/asm-x86/flushtlb.h
@@ -74,12 +74,17 @@ extern void write_cr3(unsigned long cr3)
 /* Flush guest mappings from the TLB and implicitly tick the tlbflush clock. */
 extern void local_flush_tlb(void);
 
+#ifdef USER_MAPPINGS_ARE_GLOBAL
+#define local_flush_tlb_pge() local_flush_tlb()
+#else
 #define local_flush_tlb_pge()                                     \
     do {                                                          \
-        __pge_off();                                              \
+        unsigned long cr4 = read_cr4();                           \
+        write_cr4(cr4 & ~X86_CR4_PGE);                            \
         local_flush_tlb();                                        \
-        __pge_on();                                               \
+        write_cr4(cr4);                                           \
     } while ( 0 )
+#endif
 
 #define local_flush_tlb_one(__addr) \
     __asm__ __volatile__("invlpg %0": :"m" (*(char *) (__addr)))
diff -Naurp xen/include/asm-x86/guest_access.h xen-redhat/include/asm-x86/guest_access.h
--- xen/include/asm-x86/guest_access.h
+++ xen-redhat/include/asm-x86/guest_access.h
@@ -17,7 +17,8 @@
 
 /* Offset the given guest handle into the array it refers to. */
 #define guest_handle_add_offset(hnd, nr) ((hnd).p += (nr))
-
+#define guest_handle_subtract_offset(hnd, nr) ((hnd).p -= (nr))
+ 
 /* Cast a guest handle to the specified type of handle. */
 #define guest_handle_cast(hnd, type) ({         \
     type *_x = (hnd).p;                         \
diff -Naurp xen/include/asm-x86/hvm/domain.h xen-redhat/include/asm-x86/hvm/domain.h
--- xen/include/asm-x86/hvm/domain.h
+++ xen-redhat/include/asm-x86/hvm/domain.h
@@ -27,6 +27,7 @@
 #include <asm/hvm/io.h>
 #include <public/hvm/params.h>
 #include <public/hvm/save.h>
+#include <xen/hvm/iommu.h>
 
 struct hvm_ioreq_page {
     spinlock_t lock;
@@ -45,6 +46,7 @@ struct hvm_domain {
     spinlock_t             vapic_access_lock;
     int                    physmap_changed_for_vlapic_access : 1;
     struct page_info       *apic_access_page;
+    unsigned long          vmx_apic_access_mfn;
 
     struct hvm_io_handler  io_handler;
 
@@ -54,12 +56,28 @@ struct hvm_domain {
     struct hvm_hw_vpic     vpic[2]; /* 0=master; 1=slave */
     struct hvm_hw_vioapic  vioapic;
 
+    /* VCPU which is current target for 8259 interrupts. */
+    struct vcpu           *i8259_target;
+
     /* hvm_print_line() logging. */
     char                   pbuf[80];
     int                    pbuf_idx;
     spinlock_t             pbuf_lock;
 
     uint64_t               params[HVM_NR_PARAMS];
+
+    /* Pass-through */
+    struct hvm_iommu       hvm_iommu;
+
+    /* hypervisor intercepted msix table */
+    struct list_head       msixtbl_list;
+    spinlock_t             msixtbl_list_lock;
+
+#if CONFIG_PAGING_LEVELS == 3
+    bool_t                 amd_npt_4gb_warning;
+#endif
+
+    unsigned long           vmx_vpid_base;
 };
 
 #endif /* __ASM_X86_HVM_DOMAIN_H__ */
diff -Naurp xen/include/asm-x86/hvm/hvm.h xen-redhat/include/asm-x86/hvm/hvm.h
--- xen/include/asm-x86/hvm/hvm.h
+++ xen-redhat/include/asm-x86/hvm/hvm.h
@@ -55,6 +55,14 @@ typedef struct segment_register {
     u64        base;
 } __attribute__ ((packed)) segment_register_t;
 
+/* Interrupt acknowledgement sources. */
+enum hvm_intack {
+    hvm_intack_none,
+    hvm_intack_pic,
+    hvm_intack_lapic,
+    hvm_intack_nmi
+};
+
 /*
  * The hardware virtual machine (HVM) interface abstracts away from the
  * x86/x86_64 CPU virtualization assist specifics. Currently this interface
@@ -63,14 +71,22 @@ typedef struct segment_register {
 struct hvm_function_table {
     char *name;
 
+    /* Support Hardware-Assisted Paging? */
+    int hap_supported;
+
+    /* Support 1GB host page table? */
+    int hap_1gb_pgtb;
+
     /*
      *  Disable HVM functionality
      */
     void (*disable)(void);
 
     /*
-     * Initialise/destroy HVM VCPU resources
+     * Initialise/destroy HVM domain/vcpu resources
      */
+    int  (*domain_initialise)(struct domain *d);
+    void (*domain_destroy)(struct domain *d);
     int  (*vcpu_initialise)(struct vcpu *v);
     void (*vcpu_destroy)(struct vcpu *v);
 
@@ -104,12 +120,14 @@ struct hvm_function_table {
     int (*long_mode_enabled)(struct vcpu *v);
     int (*pae_enabled)(struct vcpu *v);
     int (*nx_enabled)(struct vcpu *v);
-    int (*interrupts_enabled)(struct vcpu *v);
+    int (*interrupts_enabled)(struct vcpu *v, enum hvm_intack);
     int (*guest_x86_mode)(struct vcpu *v);
     unsigned long (*get_guest_ctrl_reg)(struct vcpu *v, unsigned int num);
     unsigned long (*get_segment_base)(struct vcpu *v, enum x86_segment seg);
     void (*get_segment_register)(struct vcpu *v, enum x86_segment seg,
                                  struct segment_register *reg);
+    void (*set_segment_register)(struct vcpu *v, enum x86_segment seg,
+                                 struct segment_register *reg);
 
     /* 
      * Re-set the value of CR3 that Xen runs on when handling VM exits
@@ -122,6 +140,12 @@ struct hvm_function_table {
     void (*update_guest_cr3)(struct vcpu *v);
 
     /*
+     * Called to inform the HVM layer that the guest loaded cr3, and set up
+     * page tables accordingly.  Operates on the current VCPU.
+     */
+    int (*set_cr3)(unsigned long value);
+
+    /*
      * Called to ensure that all guest-specific mappings in a tagged TLB
      * are flushed; does *not* flush Xen's TLB entries, and on
      * processors without a tagged TLB it will be a noop.
@@ -149,7 +173,9 @@ struct hvm_function_table {
 
     void (*init_hypercall_page)(struct domain *d, void *hypercall_page);
 
-    int  (*event_injection_faulted)(struct vcpu *v);
+    int  (*event_pending)(struct vcpu *v);
+
+    void (*update_guest_cr)(struct vcpu *v, unsigned int cr);
 };
 
 extern struct hvm_function_table hvm_funcs;
@@ -178,7 +204,11 @@ hvm_load_cpu_guest_regs(struct vcpu *v, 
     hvm_funcs.load_cpu_guest_regs(v, r);
 }
 
-void hvm_set_guest_time(struct vcpu *v, u64 gtime);
+void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc);
+u64 hvm_get_guest_tsc(struct vcpu *v);
+
+void hvm_init_guest_time(struct domain *d);
+void hvm_set_guest_time(struct vcpu *v, u64 guest_time);
 u64 hvm_get_guest_time(struct vcpu *v);
 
 static inline int
@@ -197,16 +227,16 @@ hvm_long_mode_enabled(struct vcpu *v)
 #define hvm_long_mode_enabled(v) (v,0)
 #endif
 
- static inline int
+static inline int
 hvm_pae_enabled(struct vcpu *v)
 {
     return hvm_funcs.pae_enabled(v);
 }
 
 static inline int
-hvm_interrupts_enabled(struct vcpu *v)
+hvm_interrupts_enabled(struct vcpu *v, enum hvm_intack type)
 {
-    return hvm_funcs.interrupts_enabled(v);
+    return hvm_funcs.interrupts_enabled(v, type);
 }
 
 static inline int
@@ -238,6 +268,12 @@ hvm_update_vtpr(struct vcpu *v, unsigned
 
 void hvm_update_guest_cr3(struct vcpu *v, unsigned long guest_cr3);
 
+static inline void hvm_update_guest_cr(struct vcpu *v, unsigned int cr)
+{
+    if ( hvm_funcs.update_guest_cr )
+        hvm_funcs.update_guest_cr(v, cr);
+}
+
 static inline void 
 hvm_flush_guest_tlbs(void)
 {
@@ -267,6 +303,19 @@ hvm_get_segment_register(struct vcpu *v,
     hvm_funcs.get_segment_register(v, seg, reg);
 }
 
+static inline void
+hvm_set_segment_register(struct vcpu *v, enum x86_segment seg,
+                         struct segment_register *reg)
+{
+    hvm_funcs.set_segment_register(v, seg, reg);
+}
+
+static inline int
+hvm_set_cr3(unsigned long value)
+{
+    return hvm_funcs.set_cr3(value);
+}
+
 void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
                                    unsigned int *ecx, unsigned int *edx);
 void hvm_stts(struct vcpu *v);
@@ -288,9 +337,9 @@ hvm_inject_exception(unsigned int trapnr
 
 int hvm_bringup_ap(int vcpuid, int trampoline_vector);
 
-static inline int hvm_event_injection_faulted(struct vcpu *v)
+static inline int hvm_event_pending(struct vcpu *v)
 {
-    return hvm_funcs.event_injection_faulted(v);
+    return hvm_funcs.event_pending(v);
 }
 
 /* These reserved bits in lower 32 remain 0 after any load of CR0 */
@@ -315,4 +364,23 @@ static inline int hvm_event_injection_fa
 /* These exceptions must always be intercepted. */
 #define HVM_TRAP_MASK (1U << TRAP_machine_check)
 
+#define HVM_IDENT_PT_PAGE 0xE8000
+
+enum hvm_task_switch_reason { TSW_jmp, TSW_iret, TSW_call_or_int };
+void hvm_task_switch(
+    uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason,
+    int32_t errcode);
+
+enum hvm_access_type {
+    hvm_access_insn_fetch, hvm_access_read, hvm_access_write
+};
+int hvm_virtual_to_linear_addr(
+    enum x86_segment seg,
+    struct segment_register *reg,
+    unsigned long offset,
+    unsigned int bytes,
+    enum hvm_access_type access_type,
+    unsigned int addr_size,
+    unsigned long *linear_addr);
+
 #endif /* __ASM_X86_HVM_HVM_H__ */
diff -Naurp xen/include/asm-x86/hvm/io.h xen-redhat/include/asm-x86/hvm/io.h
--- xen/include/asm-x86/hvm/io.h
+++ xen-redhat/include/asm-x86/hvm/io.h
@@ -86,14 +86,14 @@ struct hvm_io_op {
 #define HVM_MMIO                    1
 
 typedef int (*intercept_action_t)(ioreq_t *);
-typedef unsigned long (*hvm_mmio_read_t)(struct vcpu *v,
-                                         unsigned long addr,
-                                         unsigned long length);
-
-typedef void (*hvm_mmio_write_t)(struct vcpu *v,
+typedef int (*hvm_mmio_read_t)(struct vcpu *v,
                                unsigned long addr,
                                unsigned long length,
-                               unsigned long val);
+                               unsigned long *val);
+typedef int (*hvm_mmio_write_t)(struct vcpu *v,
+                                unsigned long addr,
+                                unsigned long length,
+                                unsigned long val);
 
 typedef int (*hvm_mmio_check_t)(struct vcpu *v, unsigned long addr);
 
@@ -137,13 +137,6 @@ static inline int register_portio_handle
     return register_io_handler(d, addr, size, action, HVM_PORTIO);
 }
 
-#if defined(__i386__) || defined(__x86_64__)
-static inline int irq_masked(unsigned long eflags)
-{
-    return ((eflags & X86_EFLAGS_IF) == 0);
-}
-#endif
-
 extern void send_pio_req(unsigned long port, unsigned long count, int size,
                          paddr_t value, int dir, int df, int value_is_ptr);
 void send_timeoffset_req(unsigned long timeoff);
@@ -151,6 +144,9 @@ void send_invalidate_req(void);
 extern void handle_mmio(unsigned long gpa);
 extern void hvm_interrupt_post(struct vcpu *v, int vector, int type);
 extern void hvm_io_assist(void);
+void hvm_dpci_eoi(struct domain *d, unsigned int guest_irq,
+                  union vioapic_redir_entry *ent);
 
+extern void hvm_dpci_msi_eoi(struct domain *d, int vector);
 #endif /* __ASM_X86_HVM_IO_H__ */
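Note: the hunk above changes the MMIO accessor typedefs so handlers report success/failure and return read data through a pointer. A hedged sketch of the handler shape implied by the new hvm_mmio_read_t/hvm_mmio_write_t (the names and the nonzero-means-handled convention are assumptions):

    /* Sketch: a device model's callbacks under the new signatures. */
    static int demo_mmio_read(struct vcpu *v, unsigned long addr,
                              unsigned long length, unsigned long *val)
    {
        *val = 0;       /* device register contents would be read here */
        return 1;       /* assumed "handled" */
    }

    static int demo_mmio_write(struct vcpu *v, unsigned long addr,
                               unsigned long length, unsigned long val)
    {
        /* latch 'val' into device state here */
        return 1;       /* assumed "handled" */
    }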
 
diff -Naurp xen/include/asm-x86/hvm/iommu.h xen-redhat/include/asm-x86/hvm/iommu.h
--- xen/include/asm-x86/hvm/iommu.h
+++ xen-redhat/include/asm-x86/hvm/iommu.h
@@ -0,0 +1,40 @@
+#ifndef __ASM_X86_HVM_IOMMU_H__
+#define __ASM_X86_HVM_IOMMU_H__
+
+struct iommu_ops;
+extern struct iommu_ops intel_iommu_ops;
+extern struct iommu_ops amd_iommu_ops;
+extern int intel_vtd_setup(void);
+extern int amd_iov_detect(void);
+
+static inline struct iommu_ops *iommu_get_ops(void)
+{
+    switch ( boot_cpu_data.x86_vendor )
+    {
+    case X86_VENDOR_INTEL:
+        return &intel_iommu_ops;
+    case X86_VENDOR_AMD:
+        return &amd_iommu_ops;
+    default:
+        BUG();
+    }
+
+    return NULL;
+}
+
+static inline int iommu_hardware_setup(void)
+{
+    switch ( boot_cpu_data.x86_vendor )
+    {
+    case X86_VENDOR_INTEL:
+        return intel_vtd_setup();
+    case X86_VENDOR_AMD:
+        return amd_iov_detect();
+    default:
+        BUG();
+    }
+
+    return 0;
+}
+
+#endif /* __ASM_X86_HVM_IOMMU_H__ */
diff -Naurp xen/include/asm-x86/hvm/irq.h xen-redhat/include/asm-x86/hvm/irq.h
--- xen/include/asm-x86/hvm/irq.h
+++ xen-redhat/include/asm-x86/hvm/irq.h
@@ -24,10 +24,12 @@
 
 #include <xen/types.h>
 #include <xen/spinlock.h>
+#include <asm/irq.h>
+#include <asm/hvm/hvm.h>
 #include <asm/hvm/vpic.h>
 #include <asm/hvm/vioapic.h>
 #include <public/hvm/save.h>
-
+#include <xen/hvm/irq.h>
 
 struct hvm_irq {
     /*
@@ -89,6 +91,8 @@ struct hvm_irq {
 
     /* Last VCPU that was delivered a LowestPrio interrupt. */
     u8 round_robin_prev_vcpu;
+
+    struct hvm_irq_dpci *dpci;
 };
 
 #define hvm_pci_intx_gsi(dev, intx)  \
@@ -112,12 +116,23 @@ void hvm_isa_irq_deassert(
 
 void hvm_set_pci_link_route(struct domain *d, u8 link, u8 isa_irq);
 
-void hvm_set_callback_irq_level(void);
+void hvm_maybe_deassert_evtchn_irq(void);
+void hvm_assert_evtchn_irq(struct vcpu *v);
 void hvm_set_callback_via(struct domain *d, uint64_t via);
 
-int cpu_get_interrupt(struct vcpu *v, int *type);
-int cpu_has_pending_irq(struct vcpu *v);
-int get_isa_irq_vector(struct vcpu *vcpu, int irq, int type);
+/* Check/Acknowledge next pending interrupt. */
+enum hvm_intack hvm_vcpu_has_pending_irq(struct vcpu *v);
+int hvm_vcpu_ack_pending_irq(
+    struct vcpu *v, enum hvm_intack type, int *vector);
+
+int get_isa_irq_vector(struct vcpu *vcpu, int isa_irq, enum hvm_intack src);
 int is_isa_irq_masked(struct vcpu *v, int isa_irq);
 
+/*
+ * Currently IA64 Xen doesn't support MSI. So for x86, we define this macro
+ * to control the conditional compilation of some MSI-related functions.
+ * This macro will be removed once IA64 has MSI support.
+ */
+#define SUPPORT_MSI_REMAPPING 1
+
 #endif /* __ASM_X86_HVM_IRQ_H__ */
diff -Naurp xen/include/asm-x86/hvm/support.h xen-redhat/include/asm-x86/hvm/support.h
--- xen/include/asm-x86/hvm/support.h
+++ xen-redhat/include/asm-x86/hvm/support.h
@@ -210,6 +210,8 @@ __initcall(__hvm_register_##_x##_save_an
 /* Entry points for saving and restoring HVM domain state */
 size_t hvm_save_size(struct domain *d);
 int hvm_save(struct domain *d, hvm_domain_context_t *h);
+int hvm_save_one(struct domain *d,  uint16_t typecode, uint16_t instance, 
+                 XEN_GUEST_HANDLE_64(uint8_t) handle);
 int hvm_load(struct domain *d, hvm_domain_context_t *h);
 
 /* End of save/restore */
diff -Naurp xen/include/asm-x86/hvm/svm/amd-iommu-acpi.h xen-redhat/include/asm-x86/hvm/svm/amd-iommu-acpi.h
--- xen/include/asm-x86/hvm/svm/amd-iommu-acpi.h
+++ xen-redhat/include/asm-x86/hvm/svm/amd-iommu-acpi.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (C) 2007 Advanced Micro Devices, Inc.
+ * Author: Leo Duran <leo.duran@amd.com>
+ * Author: Wei Wang <wei.wang2@amd.com> - adapted to xen
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef _ASM_X86_64_AMD_IOMMU_ACPI_H
+#define _ASM_X86_64_AMD_IOMMU_ACPI_H
+
+#include <xen/acpi.h>
+
+/* I/O Virtualization Reporting Structure */
+#define AMD_IOMMU_ACPI_IVRS_SIG            "IVRS"
+#define AMD_IOMMU_ACPI_IVHD_TYPE       0x10
+#define AMD_IOMMU_ACPI_IVMD_ALL_TYPE       0x20
+#define AMD_IOMMU_ACPI_IVMD_ONE_TYPE       0x21
+#define AMD_IOMMU_ACPI_IVMD_RANGE_TYPE     0x22
+#define AMD_IOMMU_ACPI_IVMD_IOMMU_TYPE     0x23
+
+/* 4-byte Device Entries */
+#define AMD_IOMMU_ACPI_IVHD_DEV_U32_PAD        0
+#define AMD_IOMMU_ACPI_IVHD_DEV_SELECT     2
+#define AMD_IOMMU_ACPI_IVHD_DEV_RANGE_START    3
+#define AMD_IOMMU_ACPI_IVHD_DEV_RANGE_END  4
+
+/* 8-byte Device Entries */
+#define AMD_IOMMU_ACPI_IVHD_DEV_U64_PAD        64
+#define AMD_IOMMU_ACPI_IVHD_DEV_ALIAS_SELECT   66
+#define AMD_IOMMU_ACPI_IVHD_DEV_ALIAS_RANGE    67
+#define AMD_IOMMU_ACPI_IVHD_DEV_EXT_SELECT 70
+#define AMD_IOMMU_ACPI_IVHD_DEV_EXT_RANGE  71
+#define AMD_IOMMU_ACPI_IVHD_DEV_SPECIAL    72
+
+/* IVHD IOMMU Flags */
+#define AMD_IOMMU_ACPI_COHERENT_MASK       0x20
+#define AMD_IOMMU_ACPI_COHERENT_SHIFT      5
+#define AMD_IOMMU_ACPI_IOTLB_SUP_MASK      0x10
+#define AMD_IOMMU_ACPI_IOTLB_SUP_SHIFT     4
+#define AMD_IOMMU_ACPI_ISOC_MASK       0x08
+#define AMD_IOMMU_ACPI_ISOC_SHIFT      3
+#define AMD_IOMMU_ACPI_RES_PASS_PW_MASK        0x04
+#define AMD_IOMMU_ACPI_RES_PASS_PW_SHIFT   2
+#define AMD_IOMMU_ACPI_PASS_PW_MASK        0x02
+#define AMD_IOMMU_ACPI_PASS_PW_SHIFT       1
+#define AMD_IOMMU_ACPI_HT_TUN_ENB_MASK     0x01
+#define AMD_IOMMU_ACPI_HT_TUN_ENB_SHIFT        0
+
+/* IVHD Device Flags */
+#define AMD_IOMMU_ACPI_LINT1_PASS_MASK     0x80
+#define AMD_IOMMU_ACPI_LINT1_PASS_SHIFT        7
+#define AMD_IOMMU_ACPI_LINT0_PASS_MASK     0x40
+#define AMD_IOMMU_ACPI_LINT0_PASS_SHIFT        6
+#define AMD_IOMMU_ACPI_SYS_MGT_MASK        0x30
+#define AMD_IOMMU_ACPI_SYS_MGT_SHIFT       4
+#define AMD_IOMMU_ACPI_NMI_PASS_MASK       0x04
+#define AMD_IOMMU_ACPI_NMI_PASS_SHIFT      2
+#define AMD_IOMMU_ACPI_EINT_PASS_MASK      0x02
+#define AMD_IOMMU_ACPI_EINT_PASS_SHIFT     1
+#define AMD_IOMMU_ACPI_INIT_PASS_MASK      0x01
+#define AMD_IOMMU_ACPI_INIT_PASS_SHIFT     0
+
+/* IVHD Device Extended Flags */
+#define AMD_IOMMU_ACPI_ATS_DISABLED_MASK   0x80000000
+#define AMD_IOMMU_ACPI_ATS_DISABLED_SHIFT  31
+
+/* IVMD Device Flags */
+#define AMD_IOMMU_ACPI_EXCLUSION_RANGE_MASK    0x08
+#define AMD_IOMMU_ACPI_EXCLUSION_RANGE_SHIFT   3
+#define AMD_IOMMU_ACPI_IW_PERMISSION_MASK  0x04
+#define AMD_IOMMU_ACPI_IW_PERMISSION_SHIFT 2
+#define AMD_IOMMU_ACPI_IR_PERMISSION_MASK  0x02
+#define AMD_IOMMU_ACPI_IR_PERMISSION_SHIFT 1
+#define AMD_IOMMU_ACPI_UNITY_MAPPING_MASK  0x01
+#define AMD_IOMMU_ACPI_UNITY_MAPPING_SHIFT 0
+
+#define ACPI_OEM_ID_SIZE                6
+#define ACPI_OEM_TABLE_ID_SIZE          8
+
+#pragma pack(1)
+struct acpi_ivrs_table_header {
+   struct acpi_table_header acpi_header;
+   u32 io_info;
+   u8  reserved[8];
+};
+
+struct acpi_ivrs_block_header {
+   u8  type;
+   u8  flags;
+   u16 length;
+   u16 dev_id;
+};
+
+struct acpi_ivhd_block_header {
+   struct acpi_ivrs_block_header header;
+   u16 cap_offset;
+   u64 mmio_base;
+   u16 pci_segment;
+   u16 iommu_info;
+   u8 reserved[4];
+};
+
+struct acpi_ivhd_device_header {
+   u8  type;
+   u16 dev_id;
+   u8  flags;
+};
+
+struct acpi_ivhd_device_trailer {
+   u8  type;
+   u16 dev_id;
+   u8  reserved;
+};
+
+struct acpi_ivhd_device_range {
+   struct acpi_ivhd_device_header header;
+   struct acpi_ivhd_device_trailer trailer;
+};
+
+struct acpi_ivhd_device_alias {
+   struct acpi_ivhd_device_header header;
+   u8  reserved1;
+   u16 dev_id;
+   u8  reserved2;
+};
+
+struct acpi_ivhd_device_alias_range {
+   struct acpi_ivhd_device_alias alias;
+   struct acpi_ivhd_device_trailer trailer;
+};
+
+struct acpi_ivhd_device_extended {
+   struct acpi_ivhd_device_header header;
+   u32 ext_flags;
+};
+
+struct acpi_ivhd_device_extended_range {
+   struct acpi_ivhd_device_extended extended;
+   struct acpi_ivhd_device_trailer trailer;
+};
+
+struct acpi_ivhd_device_special {
+   struct acpi_ivhd_device_header header;
+   u8  handle;
+   u16 dev_id;
+   u8  variety;
+};
+
+union acpi_ivhd_device {
+   struct acpi_ivhd_device_header header;
+   struct acpi_ivhd_device_range range;
+   struct acpi_ivhd_device_alias alias;
+   struct acpi_ivhd_device_alias_range alias_range;
+   struct acpi_ivhd_device_extended extended;
+   struct acpi_ivhd_device_extended_range extended_range;
+   struct acpi_ivhd_device_special special;
+};
+
+struct acpi_ivmd_block_header {
+   struct acpi_ivrs_block_header header;
+   union {
+       u16 last_dev_id;
+       u16 cap_offset;
+       u16 reserved1;
+   };
+   u64 reserved2;
+   u64 start_addr;
+   u64 mem_length;
+};
+#pragma pack()
+
+#endif /* _ASM_X86_64_AMD_IOMMU_ACPI_H */
diff -Naurp xen/include/asm-x86/hvm/svm/amd-iommu-defs.h xen-redhat/include/asm-x86/hvm/svm/amd-iommu-defs.h
--- xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
+++ xen-redhat/include/asm-x86/hvm/svm/amd-iommu-defs.h
@@ -0,0 +1,415 @@
+/*
+ * Copyright (C) 2007 Advanced Micro Devices, Inc.
+ * Author: Leo Duran <leo.duran@amd.com>
+ * Author: Wei Wang <wei.wang2@amd.com> - adapted to xen
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef _ASM_X86_64_AMD_IOMMU_DEFS_H
+#define _ASM_X86_64_AMD_IOMMU_DEFS_H
+
+/* IOMMU Command Buffer entries: in power of 2 increments, minimum of 256 */
+#define IOMMU_CMD_BUFFER_DEFAULT_ENTRIES	512
+
+/* IOMMU Event Log entries: in power of 2 increments, minimum of 256 */
+#define IOMMU_EVENT_LOG_DEFAULT_ENTRIES     512
+
+#define PTE_PER_TABLE_SHIFT		9
+#define PTE_PER_TABLE_SIZE		(1 << PTE_PER_TABLE_SHIFT)
+#define PTE_PER_TABLE_MASK		(~(PTE_PER_TABLE_SIZE - 1))
+#define PTE_PER_TABLE_ALIGN(entries) 	\
+	(((entries) + PTE_PER_TABLE_SIZE - 1) & PTE_PER_TABLE_MASK)
+#define PTE_PER_TABLE_ALLOC(entries)	\
+	PAGE_SIZE * (PTE_PER_TABLE_ALIGN(entries) >> PTE_PER_TABLE_SHIFT)
+
+#define PCI_MIN_CAP_OFFSET	0x40
+#define PCI_MAX_CAP_BLOCKS	48
+#define PCI_CAP_PTR_MASK	0xFC
+
+/* IOMMU Capability */
+#define PCI_CAP_ID_MASK		0x000000FF
+#define PCI_CAP_ID_SHIFT	0
+#define PCI_CAP_NEXT_PTR_MASK	0x0000FF00
+#define PCI_CAP_NEXT_PTR_SHIFT	8
+#define PCI_CAP_TYPE_MASK	0x00070000
+#define PCI_CAP_TYPE_SHIFT	16
+#define PCI_CAP_REV_MASK	0x00F80000
+#define PCI_CAP_REV_SHIFT	19
+#define PCI_CAP_IOTLB_MASK	0x01000000
+#define PCI_CAP_IOTLB_SHIFT	24
+#define PCI_CAP_HT_TUNNEL_MASK	0x02000000
+#define PCI_CAP_HT_TUNNEL_SHIFT	25
+#define PCI_CAP_NP_CACHE_MASK	0x04000000
+#define PCI_CAP_NP_CACHE_SHIFT	26
+#define PCI_CAP_RESET_MASK	0x80000000
+#define PCI_CAP_RESET_SHIFT	31
+
+#define PCI_CAP_TYPE_IOMMU		0x3
+
+#define PCI_CAP_MMIO_BAR_LOW_OFFSET	0x04
+#define PCI_CAP_MMIO_BAR_HIGH_OFFSET	0x08
+#define PCI_CAP_MMIO_BAR_LOW_MASK	0xFFFFC000
+#define IOMMU_MMIO_REGION_LENGTH	0x4000
+
+#define PCI_CAP_RANGE_OFFSET		0x0C
+#define PCI_CAP_BUS_NUMBER_MASK		0x0000FF00
+#define PCI_CAP_BUS_NUMBER_SHIFT	8
+#define PCI_CAP_FIRST_DEVICE_MASK	0x00FF0000
+#define PCI_CAP_FIRST_DEVICE_SHIFT	16
+#define PCI_CAP_LAST_DEVICE_MASK	0xFF000000
+#define PCI_CAP_LAST_DEVICE_SHIFT	24
+
+#define PCI_CAP_UNIT_ID_MASK    0x0000001F
+#define PCI_CAP_UNIT_ID_SHIFT   0
+#define PCI_MISC_INFO_OFFSET    0x10
+#define PCI_CAP_MSI_NUMBER_MASK     0x0000001F
+#define PCI_CAP_MSI_NUMBER_SHIFT    0
+
+/* Device Table */
+#define IOMMU_DEV_TABLE_BASE_LOW_OFFSET		0x00
+#define IOMMU_DEV_TABLE_BASE_HIGH_OFFSET	0x04
+#define IOMMU_DEV_TABLE_BASE_LOW_MASK		0xFFFFF000
+#define IOMMU_DEV_TABLE_BASE_LOW_SHIFT		12
+#define IOMMU_DEV_TABLE_BASE_HIGH_MASK		0x000FFFFF
+#define IOMMU_DEV_TABLE_BASE_HIGH_SHIFT		0
+#define IOMMU_DEV_TABLE_SIZE_MASK		0x000001FF
+#define IOMMU_DEV_TABLE_SIZE_SHIFT		0
+
+#define IOMMU_DEV_TABLE_ENTRIES_PER_BUS		256
+#define IOMMU_DEV_TABLE_ENTRY_SIZE		32
+#define IOMMU_DEV_TABLE_U32_PER_ENTRY		(IOMMU_DEV_TABLE_ENTRY_SIZE / 4)
+
+#define IOMMU_DEV_TABLE_SYS_MGT_DMA_ABORTED	0x0
+#define IOMMU_DEV_TABLE_SYS_MGT_MSG_FORWARDED	0x1
+#define IOMMU_DEV_TABLE_SYS_MGT_INT_FORWARDED	0x2
+#define IOMMU_DEV_TABLE_SYS_MGT_DMA_FORWARDED	0x3
+
+#define IOMMU_DEV_TABLE_IO_CONTROL_ABORTED	0x0
+#define IOMMU_DEV_TABLE_IO_CONTROL_FORWARDED	0x1
+#define IOMMU_DEV_TABLE_IO_CONTROL_TRANSLATED	0x2
+
+#define IOMMU_DEV_TABLE_INT_CONTROL_ABORTED	0x0
+#define IOMMU_DEV_TABLE_INT_CONTROL_FORWARDED	0x1
+#define IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED	0x2
+
+/* DeviceTable Entry[31:0] */
+#define IOMMU_DEV_TABLE_VALID_MASK			0x00000001
+#define IOMMU_DEV_TABLE_VALID_SHIFT			0
+#define IOMMU_DEV_TABLE_TRANSLATION_VALID_MASK		0x00000002
+#define IOMMU_DEV_TABLE_TRANSLATION_VALID_SHIFT		1
+#define IOMMU_DEV_TABLE_PAGING_MODE_MASK		0x00000E00
+#define IOMMU_DEV_TABLE_PAGING_MODE_SHIFT		9
+#define IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_MASK		0xFFFFF000
+#define IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_SHIFT	12
+
+/* DeviceTable Entry[63:32] */
+#define IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_MASK	0x000FFFFF
+#define IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_SHIFT	0
+#define IOMMU_DEV_TABLE_IO_READ_PERMISSION_MASK		0x20000000
+#define IOMMU_DEV_TABLE_IO_READ_PERMISSION_SHIFT	29
+#define IOMMU_DEV_TABLE_IO_WRITE_PERMISSION_MASK	0x40000000
+#define IOMMU_DEV_TABLE_IO_WRITE_PERMISSION_SHIFT	30
+
+/* DeviceTable Entry[95:64] */
+#define IOMMU_DEV_TABLE_DOMAIN_ID_MASK	0x0000FFFF
+#define IOMMU_DEV_TABLE_DOMAIN_ID_SHIFT	0
+
+/* DeviceTable Entry[127:96] */
+#define IOMMU_DEV_TABLE_IOTLB_SUPPORT_MASK		0x00000001
+#define IOMMU_DEV_TABLE_IOTLB_SUPPORT_SHIFT		0
+#define IOMMU_DEV_TABLE_SUPRESS_LOGGED_PAGES_MASK	0x00000002
+#define IOMMU_DEV_TABLE_SUPRESS_LOGGED_PAGES_SHIFT	1
+#define IOMMU_DEV_TABLE_SUPRESS_ALL_PAGES_MASK		0x00000004
+#define IOMMU_DEV_TABLE_SUPRESS_ALL_PAGES_SHIFT		2
+#define IOMMU_DEV_TABLE_IO_CONTROL_MASK			0x00000018
+#define IOMMU_DEV_TABLE_IO_CONTROL_SHIFT		3
+#define IOMMU_DEV_TABLE_IOTLB_CACHE_HINT_MASK		0x00000020
+#define IOMMU_DEV_TABLE_IOTLB_CACHE_HINT_SHIFT		5
+#define IOMMU_DEV_TABLE_SNOOP_DISABLE_MASK		0x00000040
+#define IOMMU_DEV_TABLE_SNOOP_DISABLE_SHIFT		6
+#define IOMMU_DEV_TABLE_ALLOW_EXCLUSION_MASK		0x00000080
+#define IOMMU_DEV_TABLE_ALLOW_EXCLUSION_SHIFT		7
+#define IOMMU_DEV_TABLE_SYS_MGT_MSG_ENABLE_MASK		0x00000300
+#define IOMMU_DEV_TABLE_SYS_MGT_MSG_ENABLE_SHIFT	8
+
+/* DeviceTable Entry[159:128] */
+#define IOMMU_DEV_TABLE_INT_VALID_MASK          0x00000001
+#define IOMMU_DEV_TABLE_INT_VALID_SHIFT         0
+#define IOMMU_DEV_TABLE_INT_TABLE_LENGTH_MASK       0x0000001E
+#define IOMMU_DEV_TABLE_INT_TABLE_LENGTH_SHIFT      1
+#define IOMMU_DEV_TABLE_INT_TABLE_IGN_UNMAPPED_MASK      0x0000000020
+#define IOMMU_DEV_TABLE_INT_TABLE_IGN_UNMAPPED_SHIFT      5
+#define IOMMU_DEV_TABLE_INT_TABLE_PTR_LOW_MASK      0xFFFFFFC0
+#define IOMMU_DEV_TABLE_INT_TABLE_PTR_LOW_SHIFT     6
+
+/* DeviceTable Entry[191:160] */
+#define IOMMU_DEV_TABLE_INT_TABLE_PTR_HIGH_MASK     0x000FFFFF
+#define IOMMU_DEV_TABLE_INT_TABLE_PTR_HIGH_SHIFT    0
+#define IOMMU_DEV_TABLE_INIT_PASSTHRU_MASK      0x01000000
+#define IOMMU_DEV_TABLE_INIT_PASSTHRU_SHIFT     24
+#define IOMMU_DEV_TABLE_EINT_PASSTHRU_MASK      0x02000000
+#define IOMMU_DEV_TABLE_EINT_PASSTHRU_SHIFT     25
+#define IOMMU_DEV_TABLE_NMI_PASSTHRU_MASK       0x04000000
+#define IOMMU_DEV_TABLE_NMI_PASSTHRU_SHIFT      26
+#define IOMMU_DEV_TABLE_INT_CONTROL_MASK        0x30000000
+#define IOMMU_DEV_TABLE_INT_CONTROL_SHIFT       28
+#define IOMMU_DEV_TABLE_LINT0_ENABLE_MASK       0x40000000
+#define IOMMU_DEV_TABLE_LINT0_ENABLE_SHIFT      30
+#define IOMMU_DEV_TABLE_LINT1_ENABLE_MASK       0x80000000
+#define IOMMU_DEV_TABLE_LINT1_ENABLE_SHIFT      31
+
+/* Command Buffer */
+#define IOMMU_CMD_BUFFER_BASE_LOW_OFFSET	0x08
+#define IOMMU_CMD_BUFFER_BASE_HIGH_OFFSET	0x0C
+#define IOMMU_CMD_BUFFER_HEAD_OFFSET		0x2000
+#define IOMMU_CMD_BUFFER_TAIL_OFFSET		0x2008
+#define IOMMU_CMD_BUFFER_BASE_LOW_MASK		0xFFFFF000
+#define IOMMU_CMD_BUFFER_BASE_LOW_SHIFT		12
+#define IOMMU_CMD_BUFFER_BASE_HIGH_MASK		0x000FFFFF
+#define IOMMU_CMD_BUFFER_BASE_HIGH_SHIFT	0
+#define IOMMU_CMD_BUFFER_LENGTH_MASK		0x0F000000
+#define IOMMU_CMD_BUFFER_LENGTH_SHIFT		24
+#define IOMMU_CMD_BUFFER_HEAD_MASK		0x0007FFF0
+#define IOMMU_CMD_BUFFER_HEAD_SHIFT		4
+#define IOMMU_CMD_BUFFER_TAIL_MASK		0x0007FFF0
+#define IOMMU_CMD_BUFFER_TAIL_SHIFT		4
+
+#define IOMMU_CMD_BUFFER_ENTRY_SIZE			16
+#define IOMMU_CMD_BUFFER_POWER_OF2_ENTRIES_PER_PAGE	8
+#define IOMMU_CMD_BUFFER_U32_PER_ENTRY 	(IOMMU_CMD_BUFFER_ENTRY_SIZE / 4)
+
+#define IOMMU_CMD_OPCODE_MASK			0xF0000000
+#define IOMMU_CMD_OPCODE_SHIFT			28
+#define IOMMU_CMD_COMPLETION_WAIT		0x1
+#define IOMMU_CMD_INVALIDATE_DEVTAB_ENTRY	0x2
+#define IOMMU_CMD_INVALIDATE_IOMMU_PAGES	0x3
+#define IOMMU_CMD_INVALIDATE_IOTLB_PAGES	0x4
+#define IOMMU_CMD_INVALIDATE_INT_TABLE		0x5
+
+/* COMPLETION_WAIT command */
+#define IOMMU_COMP_WAIT_DATA_BUFFER_SIZE	8
+#define IOMMU_COMP_WAIT_DATA_BUFFER_ALIGNMENT	8
+#define IOMMU_COMP_WAIT_S_FLAG_MASK		0x00000001
+#define IOMMU_COMP_WAIT_S_FLAG_SHIFT		0
+#define IOMMU_COMP_WAIT_I_FLAG_MASK		0x00000002
+#define IOMMU_COMP_WAIT_I_FLAG_SHIFT		1
+#define IOMMU_COMP_WAIT_F_FLAG_MASK		0x00000004
+#define IOMMU_COMP_WAIT_F_FLAG_SHIFT		2
+#define IOMMU_COMP_WAIT_ADDR_LOW_MASK		0xFFFFFFF8
+#define IOMMU_COMP_WAIT_ADDR_LOW_SHIFT		3
+#define IOMMU_COMP_WAIT_ADDR_HIGH_MASK		0x000FFFFF
+#define IOMMU_COMP_WAIT_ADDR_HIGH_SHIFT		0
+
+/* INVALIDATE_IOMMU_PAGES command */
+#define IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_MASK	0x0000FFFF
+#define IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_SHIFT	0
+#define IOMMU_INV_IOMMU_PAGES_S_FLAG_MASK	0x00000001
+#define IOMMU_INV_IOMMU_PAGES_S_FLAG_SHIFT	0
+#define IOMMU_INV_IOMMU_PAGES_PDE_FLAG_MASK	0x00000002
+#define IOMMU_INV_IOMMU_PAGES_PDE_FLAG_SHIFT	1
+#define IOMMU_INV_IOMMU_PAGES_ADDR_LOW_MASK	0xFFFFF000
+#define IOMMU_INV_IOMMU_PAGES_ADDR_LOW_SHIFT	12
+#define IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_MASK	0xFFFFFFFF
+#define IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_SHIFT	0
+
+/* INVALIDATE_DEVTAB_ENTRY command */
+#define IOMMU_INV_DEVTAB_ENTRY_DEVICE_ID_MASK   0x0000FFFF
+#define IOMMU_INV_DEVTAB_ENTRY_DEVICE_ID_SHIFT  0
+
+/* INVALIDATE_INTERRUPT_TABLE command */
+#define IOMMU_INV_INT_TABLE_DEVICE_ID_MASK   0x0000FFFF
+#define IOMMU_INV_INT_TABLE_DEVICE_ID_SHIFT  0
+
+/* Event Log */
+#define IOMMU_EVENT_LOG_BASE_LOW_OFFSET		0x10
+#define IOMMU_EVENT_LOG_BASE_HIGH_OFFSET	0x14
+#define IOMMU_EVENT_LOG_HEAD_OFFSET		0x2010
+#define IOMMU_EVENT_LOG_TAIL_OFFSET		0x2018
+#define IOMMU_EVENT_LOG_BASE_LOW_MASK		0xFFFFF000
+#define IOMMU_EVENT_LOG_BASE_LOW_SHIFT		12
+#define IOMMU_EVENT_LOG_BASE_HIGH_MASK		0x000FFFFF
+#define IOMMU_EVENT_LOG_BASE_HIGH_SHIFT		0
+#define IOMMU_EVENT_LOG_LENGTH_MASK		0x0F000000
+#define IOMMU_EVENT_LOG_LENGTH_SHIFT		24
+#define IOMMU_EVENT_LOG_HEAD_MASK		0x0007FFF0
+#define IOMMU_EVENT_LOG_HEAD_SHIFT		4
+#define IOMMU_EVENT_LOG_TAIL_MASK		0x0007FFF0
+#define IOMMU_EVENT_LOG_TAIL_SHIFT		4
+
+#define IOMMU_EVENT_LOG_ENTRY_SIZE 			16
+#define IOMMU_EVENT_LOG_POWER_OF2_ENTRIES_PER_PAGE	8
+#define IOMMU_EVENT_LOG_U32_PER_ENTRY	(IOMMU_EVENT_LOG_ENTRY_SIZE / 4)
+
+#define IOMMU_EVENT_CODE_MASK			0xF0000000
+#define IOMMU_EVENT_CODE_SHIFT			28
+#define IOMMU_EVENT_ILLEGAL_DEV_TABLE_ENTRY	0x1
+#define IOMMU_EVENT_IO_PAGE_FALT		0x2
+#define IOMMU_EVENT_DEV_TABLE_HW_ERROR		0x3
+#define IOMMU_EVENT_PAGE_TABLE_HW_ERROR		0x4
+#define IOMMU_EVENT_ILLEGAL_COMMAND_ERROR	0x5
+#define IOMMU_EVENT_COMMAND_HW_ERROR		0x6
+#define IOMMU_EVENT_IOTLB_INV_TIMEOUT		0x7
+#define IOMMU_EVENT_INVALID_DEV_REQUEST		0x8
+
+#define IOMMU_EVENT_DOMAIN_ID_MASK           0x0000FFFF
+#define IOMMU_EVENT_DOMAIN_ID_SHIFT          0
+#define IOMMU_EVENT_DEVICE_ID_MASK           0x0000FFFF
+#define IOMMU_EVENT_DEVICE_ID_SHIFT          0
+
+/* Control Register */
+#define IOMMU_CONTROL_MMIO_OFFSET			0x18
+#define IOMMU_CONTROL_TRANSLATION_ENABLE_MASK		0x00000001
+#define IOMMU_CONTROL_TRANSLATION_ENABLE_SHIFT		0
+#define IOMMU_CONTROL_HT_TUNNEL_TRANSLATION_MASK	0x00000002
+#define IOMMU_CONTROL_HT_TUNNEL_TRANSLATION_SHIFT	1
+#define IOMMU_CONTROL_EVENT_LOG_ENABLE_MASK		0x00000004
+#define IOMMU_CONTROL_EVENT_LOG_ENABLE_SHIFT		2
+#define IOMMU_CONTROL_EVENT_LOG_INT_MASK		0x00000008
+#define IOMMU_CONTROL_EVENT_LOG_INT_SHIFT		3
+#define IOMMU_CONTROL_COMP_WAIT_INT_MASK		0x00000010
+#define IOMMU_CONTROL_COMP_WAIT_INT_SHIFT		4
+#define IOMMU_CONTROL_TRANSLATION_CHECK_DISABLE_MASK	0x00000020
+#define IOMMU_CONTROL_TRANSLATION_CHECK_DISABLE_SHIFT	5
+#define IOMMU_CONTROL_INVALIDATION_TIMEOUT_MASK		0x000000C0
+#define IOMMU_CONTROL_INVALIDATION_TIMEOUT_SHIFT	6
+#define IOMMU_CONTROL_PASS_POSTED_WRITE_MASK		0x00000100
+#define IOMMU_CONTROL_PASS_POSTED_WRITE_SHIFT		8
+#define IOMMU_CONTROL_RESP_PASS_POSTED_WRITE_MASK	0x00000200
+#define IOMMU_CONTROL_RESP_PASS_POSTED_WRITE_SHIFT	9
+#define IOMMU_CONTROL_COHERENT_MASK			0x00000400
+#define IOMMU_CONTROL_COHERENT_SHIFT			10
+#define IOMMU_CONTROL_ISOCHRONOUS_MASK			0x00000800
+#define IOMMU_CONTROL_ISOCHRONOUS_SHIFT			11
+#define IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_MASK	0x00001000
+#define IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_SHIFT	12
+#define IOMMU_CONTROL_RESTART_MASK			0x80000000
+#define IOMMU_CONTROL_RESTART_SHIFT			31
+
+/* Exclusion Register */
+#define IOMMU_EXCLUSION_BASE_LOW_OFFSET		0x20
+#define IOMMU_EXCLUSION_BASE_HIGH_OFFSET	0x24
+#define IOMMU_EXCLUSION_LIMIT_LOW_OFFSET	0x28
+#define IOMMU_EXCLUSION_LIMIT_HIGH_OFFSET	0x2C
+#define IOMMU_EXCLUSION_BASE_LOW_MASK		0xFFFFF000
+#define IOMMU_EXCLUSION_BASE_LOW_SHIFT		12
+#define IOMMU_EXCLUSION_BASE_HIGH_MASK		0xFFFFFFFF
+#define IOMMU_EXCLUSION_BASE_HIGH_SHIFT		0
+#define IOMMU_EXCLUSION_RANGE_ENABLE_MASK	0x00000001
+#define IOMMU_EXCLUSION_RANGE_ENABLE_SHIFT	0
+#define IOMMU_EXCLUSION_ALLOW_ALL_MASK		0x00000002
+#define IOMMU_EXCLUSION_ALLOW_ALL_SHIFT		1
+#define IOMMU_EXCLUSION_LIMIT_LOW_MASK		0xFFFFF000
+#define IOMMU_EXCLUSION_LIMIT_LOW_SHIFT		12
+#define IOMMU_EXCLUSION_LIMIT_HIGH_MASK		0xFFFFFFFF
+#define IOMMU_EXCLUSION_LIMIT_HIGH_SHIFT	0
+
+/* Status Register*/
+#define IOMMU_STATUS_MMIO_OFFSET		0x2020
+#define IOMMU_STATUS_EVENT_OVERFLOW_MASK	0x00000001
+#define IOMMU_STATUS_EVENT_OVERFLOW_SHIFT	0
+#define IOMMU_STATUS_EVENT_LOG_INT_MASK		0x00000002
+#define IOMMU_STATUS_EVENT_LOG_INT_SHIFT	1
+#define IOMMU_STATUS_COMP_WAIT_INT_MASK		0x00000004
+#define IOMMU_STATUS_COMP_WAIT_INT_SHIFT	2
+#define IOMMU_STATUS_EVENT_LOG_RUN_MASK		0x00000008
+#define IOMMU_STATUS_EVENT_LOG_RUN_SHIFT	3
+#define IOMMU_STATUS_CMD_BUFFER_RUN_MASK	0x00000010
+#define IOMMU_STATUS_CMD_BUFFER_RUN_SHIFT	4
+
+/* I/O Page Table */
+#define IOMMU_PAGE_TABLE_ENTRY_SIZE	8
+#define IOMMU_PAGE_TABLE_U32_PER_ENTRY	(IOMMU_PAGE_TABLE_ENTRY_SIZE / 4)
+#define IOMMU_PAGE_TABLE_ALIGNMENT	4096
+
+#define IOMMU_PTE_PRESENT_MASK			0x00000001
+#define IOMMU_PTE_PRESENT_SHIFT			0
+#define IOMMU_PTE_NEXT_LEVEL_MASK		0x00000E00
+#define IOMMU_PTE_NEXT_LEVEL_SHIFT		9
+#define IOMMU_PTE_ADDR_LOW_MASK			0xFFFFF000
+#define IOMMU_PTE_ADDR_LOW_SHIFT		12
+#define IOMMU_PTE_ADDR_HIGH_MASK		0x000FFFFF
+#define IOMMU_PTE_ADDR_HIGH_SHIFT		0
+#define IOMMU_PTE_U_MASK			0x08000000
+#define IOMMU_PTE_U_SHIFT			7
+#define IOMMU_PTE_FC_MASK			0x10000000
+#define IOMMU_PTE_FC_SHIFT			28
+#define IOMMU_PTE_IO_READ_PERMISSION_MASK	0x20000000
+#define IOMMU_PTE_IO_READ_PERMISSION_SHIFT	29
+#define IOMMU_PTE_IO_WRITE_PERMISSION_MASK	0x40000000
+#define IOMMU_PTE_IO_WRITE_PERMISSION_SHIFT	30
+
+/* I/O Page Directory */
+#define IOMMU_PAGE_DIRECTORY_ENTRY_SIZE		8
+#define IOMMU_PAGE_DIRECTORY_ALIGNMENT		4096
+#define IOMMU_PDE_PRESENT_MASK			0x00000001
+#define IOMMU_PDE_PRESENT_SHIFT			0
+#define IOMMU_PDE_NEXT_LEVEL_MASK		0x00000E00
+#define IOMMU_PDE_NEXT_LEVEL_SHIFT		9
+#define IOMMU_PDE_ADDR_LOW_MASK			0xFFFFF000
+#define IOMMU_PDE_ADDR_LOW_SHIFT		12
+#define IOMMU_PDE_ADDR_HIGH_MASK		0x000FFFFF
+#define IOMMU_PDE_ADDR_HIGH_SHIFT		0
+#define IOMMU_PDE_IO_READ_PERMISSION_MASK	0x20000000
+#define IOMMU_PDE_IO_READ_PERMISSION_SHIFT	29
+#define IOMMU_PDE_IO_WRITE_PERMISSION_MASK	0x40000000
+#define IOMMU_PDE_IO_WRITE_PERMISSION_SHIFT	30
+
+/* Paging modes */
+#define IOMMU_PAGING_MODE_DISABLED	0x0
+#define IOMMU_PAGING_MODE_LEVEL_0	0x0
+#define IOMMU_PAGING_MODE_LEVEL_1	0x1
+#define IOMMU_PAGING_MODE_LEVEL_2	0x2
+#define IOMMU_PAGING_MODE_LEVEL_3	0x3
+#define IOMMU_PAGING_MODE_LEVEL_4	0x4
+#define IOMMU_PAGING_MODE_LEVEL_5	0x5
+#define IOMMU_PAGING_MODE_LEVEL_6	0x6
+#define IOMMU_PAGING_MODE_LEVEL_7	0x7
+
+/* Flags */
+#define IOMMU_CONTROL_DISABLED	0
+#define IOMMU_CONTROL_ENABLED	1
+
+#define MMIO_PAGES_PER_IOMMU        (IOMMU_MMIO_REGION_LENGTH / PAGE_SIZE_4K)
+#define IOMMU_PAGES                 (MMIO_PAGES_PER_IOMMU * MAX_AMD_IOMMUS)
+#define DEFAULT_DOMAIN_ADDRESS_WIDTH    48
+#define MAX_AMD_IOMMUS                  32
+#define IOMMU_PAGE_TABLE_LEVEL_3        3
+#define IOMMU_PAGE_TABLE_LEVEL_4        4
+#define IOMMU_IO_WRITE_ENABLED          1
+#define IOMMU_IO_READ_ENABLED           1
+#define HACK_BIOS_SETTINGS                  0
+
+/* interrupt remapping table */
+#define INT_REMAP_INDEX_DM_MASK         0x1C00
+#define INT_REMAP_INDEX_DM_SHIFT        10
+#define INT_REMAP_INDEX_VECTOR_MASK     0x3FC
+#define INT_REMAP_INDEX_VECTOR_SHIFT    2
+#define INT_REMAP_ENTRY_REMAPEN_MASK    0x00000001
+#define INT_REMAP_ENTRY_REMAPEN_SHIFT   0
+#define INT_REMAP_ENTRY_SUPIOPF_MASK    0x00000002
+#define INT_REMAP_ENTRY_SUPIOPF_SHIFT   1
+#define INT_REMAP_ENTRY_INTTYPE_MASK    0x0000001C
+#define INT_REMAP_ENTRY_INTTYPE_SHIFT   2
+#define INT_REMAP_ENTRY_REQEOI_MASK     0x00000020
+#define INT_REMAP_ENTRY_REQEOI_SHIFT    5
+#define INT_REMAP_ENTRY_DM_MASK         0x00000040
+#define INT_REMAP_ENTRY_DM_SHIFT        6
+#define INT_REMAP_ENTRY_DEST_MAST       0x0000FF00
+#define INT_REMAP_ENTRY_DEST_SHIFT      8
+#define INT_REMAP_ENTRY_VECTOR_MASK     0x00FF0000
+#define INT_REMAP_ENTRY_VECTOR_SHIFT    16
+
+#endif /* _ASM_X86_64_AMD_IOMMU_DEFS_H */
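Note: all of the register and table layouts above are described as MASK/SHIFT pairs; a stand-alone sketch of the intended extraction idiom follows (amd-iommu-proto.h adds get_field_from_reg_u32() for the same purpose inside Xen; the local helper and sample value here are illustrative only):

    #include <stdint.h>
    #include <stdio.h>

    #define IOMMU_CMD_OPCODE_MASK   0xF0000000
    #define IOMMU_CMD_OPCODE_SHIFT  28

    /* Extract a bit-field defined by a MASK/SHIFT pair. */
    static uint32_t get_field(uint32_t reg, uint32_t mask, uint32_t shift)
    {
        return (reg & mask) >> shift;
    }

    int main(void)
    {
        uint32_t cmd_dword1 = 0x30000000u;  /* hypothetical command word */

        /* 0x3 corresponds to IOMMU_CMD_INVALIDATE_IOMMU_PAGES above. */
        printf("opcode = %#x\n",
               (unsigned int)get_field(cmd_dword1, IOMMU_CMD_OPCODE_MASK,
                                       IOMMU_CMD_OPCODE_SHIFT));
        return 0;
    }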
diff -Naurp xen/include/asm-x86/hvm/svm/amd-iommu-proto.h xen-redhat/include/asm-x86/hvm/svm/amd-iommu-proto.h
--- xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ xen-redhat/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright (C) 2007 Advanced Micro Devices, Inc.
+ * Author: Leo Duran <leo.duran@amd.com>
+ * Author: Wei Wang <wei.wang2@amd.com> - adapted to xen
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef _ASM_X86_64_AMD_IOMMU_PROTO_H
+#define _ASM_X86_64_AMD_IOMMU_PROTO_H
+
+#include <xen/sched.h>
+#include <asm/amd-iommu.h>
+#include <xen/domain_page.h>
+
+#define for_each_amd_iommu(amd_iommu) \
+    list_for_each_entry(amd_iommu, \
+        &amd_iommu_head, list)
+
+#define DMA_32BIT_MASK  0x00000000ffffffffULL
+#define PAGE_ALIGN(addr)    (((addr) + PAGE_SIZE - 1) & PAGE_MASK)
+
+extern int amd_iommu_debug;
+extern int amd_iommu_perdev_intremap;
+
+#define AMD_IOMMU_DEBUG(fmt, args...)                       \
+  do                                                        \
+  {                                                         \
+          if ( amd_iommu_debug )                            \
+              printk(XENLOG_INFO "AMD-Vi: " fmt, ## args);  \
+  } while(0)
+
+/* amd-iommu-detect functions */
+int __init amd_iommu_get_ivrs_dev_entries(void);
+int __init amd_iommu_detect_one_acpi(void *ivhd);
+int __init amd_iommu_detect_acpi(void);
+
+/* amd-iommu-init functions */
+int __init amd_iommu_init(void);
+int __init amd_iommu_init_one(struct amd_iommu *iommu);
+int __init amd_iommu_update_ivrs_mapping_acpi(void);
+void __init amd_iommu_init_cleanup(void);
+int __init amd_iommu_setup_shared_tables(void);
+
+/* mapping functions */
+int amd_iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn);
+int amd_iommu_unmap_page(struct domain *d, unsigned long gfn);
+u64 amd_iommu_get_next_table_from_pte(u32 *entry);
+int amd_iommu_reserve_domain_unity_map(struct domain *domain,
+        unsigned long phys_addr, unsigned long size, int iw, int ir);
+int amd_iommu_sync_p2m(struct domain *d);
+void invalidate_all_iommu_pages(struct domain *d);
+
+/* device table functions */
+void amd_iommu_add_dev_table_entry(
+     u32 *dte, u8 sys_mgt, u8 dev_ex, u8 lint1_pass, u8 lint0_pass,
+     u8 nmi_pass, u8 ext_int_pass, u8 init_pass);
+int amd_iommu_is_dte_page_translation_valid(u32 *entry);
+int get_dma_requestor_id(u16 bdf);
+void invalidate_dev_table_entry(struct amd_iommu *iommu, u16 device_id);
+void amd_iommu_set_intremap_table(
+     u32 *dte, u64 intremap_ptr, u8 int_valid);
+void amd_iommu_set_root_page_table(
+     u32 *dte, u64 root_ptr, u16 domain_id, u8 paging_mode, u8 valid);
+
+
+/* send cmd to iommu */
+int send_iommu_command(struct amd_iommu *iommu, u32 cmd[]);
+void flush_command_buffer(struct amd_iommu *iommu);
+
+/* find iommu for bdf */
+struct amd_iommu *find_iommu_for_device(int bus, int devfn);
+
+/* interrupt remapping */
+int __init amd_iommu_setup_intremap_table(void);
+void __init amd_iommu_free_intremap_table(int bdf);
+void invalidate_interrupt_table(struct amd_iommu *iommu, u16 device_id);
+void amd_iommu_ioapic_update_ire(unsigned int apic, unsigned int reg, unsigned
+                                int value);
+void amd_iommu_msi_msg_update_ire(struct msi_desc *msi_desc, struct msi_msg *msg);
+void * __init amd_iommu_alloc_intremap_table(void);
+int __init amd_iommu_setup_ioapic_remapping(void);
+void*__init amd_iommu_alloc_intremap_table(void);
+void __init amd_iommu_free_intremap_table(int bdf);
+
+
+static inline u32 get_field_from_reg_u32(u32 reg_value, u32 mask, u32 shift)
+{
+    u32 field;
+    field = (reg_value & mask) >> shift;
+    return field;
+}
+
+static inline u32 set_field_in_reg_u32(u32 field, u32 reg_value,
+        u32 mask, u32 shift, u32 *reg)
+{
+    reg_value &= ~mask;
+    reg_value |= (field << shift) & mask;
+    if (reg)
+        *reg = reg_value;
+    return reg_value;
+}
+
+static inline u8 get_field_from_byte(u8 value, u8 mask, u8 shift)
+{
+    u8 field;
+    field = (value & mask) >> shift;
+    return field;
+}
+
+static inline unsigned long region_to_pages(unsigned long addr, unsigned long size)
+{
+    return (PAGE_ALIGN(addr + size) - (addr & PAGE_MASK)) >> PAGE_SHIFT;
+}
+
+static inline struct page_info* alloc_amd_iommu_pgtable(void)
+{
+    struct page_info *pg;
+    void *vaddr;
+
+    pg = alloc_domheap_page(NULL);
+    if ( pg == NULL )
+        return 0;
+    vaddr = map_domain_page(page_to_mfn(pg));
+    if ( !vaddr )
+        return 0;
+    memset(vaddr, 0, PAGE_SIZE);
+    unmap_domain_page(vaddr);
+    return pg;
+}
+
+static inline void free_amd_iommu_pgtable(struct page_info *pg)
+{
+    if ( pg != 0 )
+        free_domheap_page(pg);
+}
+
+static inline void* __alloc_amd_iommu_tables(int order)
+{
+    void *buf;
+    buf = alloc_xenheap_pages(order);
+    return buf;
+}
+
+static inline void __free_amd_iommu_tables(void *table, int order)
+{
+    free_xenheap_pages(table, order);
+}
+
+#endif /* _ASM_X86_64_AMD_IOMMU_PROTO_H */
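Illustrative sketch (editorial, not part of the patch): how the register-field helpers above are meant to be combined with the IOMMU_PDE_IO_*_PERMISSION_{MASK,SHIFT} constants from amd-iommu-defs.h. The pde_hi argument (upper dword of a page-directory entry) and the function names are assumptions for the example.

/* Set/query the IW/IR permission bits in the upper dword of a PDE. */
static void example_set_pde_permissions(u32 *pde_hi, int iw, int ir)
{
    u32 entry = *pde_hi;

    set_field_in_reg_u32(iw ? IOMMU_CONTROL_ENABLED : IOMMU_CONTROL_DISABLED,
                         entry, IOMMU_PDE_IO_WRITE_PERMISSION_MASK,
                         IOMMU_PDE_IO_WRITE_PERMISSION_SHIFT, &entry);
    set_field_in_reg_u32(ir ? IOMMU_CONTROL_ENABLED : IOMMU_CONTROL_DISABLED,
                         entry, IOMMU_PDE_IO_READ_PERMISSION_MASK,
                         IOMMU_PDE_IO_READ_PERMISSION_SHIFT, &entry);
    *pde_hi = entry;
}

static int example_pde_writable(u32 pde_hi)
{
    return get_field_from_reg_u32(pde_hi,
                                  IOMMU_PDE_IO_WRITE_PERMISSION_MASK,
                                  IOMMU_PDE_IO_WRITE_PERMISSION_SHIFT);
}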
diff -Naurp xen/include/asm-x86/hvm/svm/emulate.h xen-redhat/include/asm-x86/hvm/svm/emulate.h
--- xen/include/asm-x86/hvm/svm/emulate.h
+++ xen-redhat/include/asm-x86/hvm/svm/emulate.h
@@ -89,14 +89,14 @@ extern unsigned int decode_src_reg(u8 pr
 extern unsigned long svm_rip2pointer(struct vcpu *v);
 extern int __get_instruction_length_from_list(struct vcpu *v,
         enum instruction_index *list, unsigned int list_count, 
-        u8 *guest_eip_buf, enum instruction_index *match);
+        enum instruction_index *match);
 
 
 static inline int __get_instruction_length(struct vcpu *v, 
-        enum instruction_index instr, u8 *guest_eip_buf)
+        enum instruction_index instr)
 {
     return __get_instruction_length_from_list(
-        v, &instr, 1, guest_eip_buf, NULL);
+        v, &instr, 1, NULL);
 }
 
 
diff -Naurp xen/include/asm-x86/hvm/svm/svm.h xen-redhat/include/asm-x86/hvm/svm/svm.h
--- xen/include/asm-x86/hvm/svm/svm.h
+++ xen-redhat/include/asm-x86/hvm/svm/svm.h
@@ -28,7 +28,7 @@
 #include <asm/hvm/svm/vmcb.h>
 #include <asm/i387.h>
 
-extern void svm_dump_vmcb(const char *from, struct vmcb_struct *vmcb);
+void svm_dump_vmcb(const char *from, struct vmcb_struct *vmcb);
 
 #define SVM_REG_EAX (0) 
 #define SVM_REG_ECX (1) 
@@ -47,4 +47,45 @@ extern void svm_dump_vmcb(const char *fr
 #define SVM_REG_R14 (14)
 #define SVM_REG_R15 (15)
 
+static inline void svm_vmload(void *vmcb)
+{
+    asm volatile (
+        ".byte 0x0f,0x01,0xda" /* vmload */
+        : : "a" (__pa(vmcb)) : "memory" );
+}
+
+static inline void svm_vmsave(void *vmcb)
+{
+    asm volatile (
+        ".byte 0x0f,0x01,0xdb" /* vmsave */
+        : : "a" (__pa(vmcb)) : "memory" );
+}
+
+/*
+ * Need to re-inject a given event? We avoid re-injecting software exceptions
+ * and interrupts because the faulting/trapping instruction can simply be
+ * re-executed (neither VMX nor SVM update RIP when they VMEXIT during
+ * INT3/INTO/INTn).
+ */
+static inline int svm_event_needs_reinjection(uint8_t type, uint8_t vector)
+{
+    switch ( type )
+    {
+    case EVENTTYPE_INTR:
+    case EVENTTYPE_NMI:
+        return 1;
+    case EVENTTYPE_EXCEPTION:
+        /*
+         * SVM uses type 3 ("HW Exception") for #OF and #BP. We explicitly
+         * check for these vectors, as they are really SW Exceptions. SVM has
+         * not updated RIP to point after the trapping instruction (INT3/INTO).
+         */
+        return (vector != 3) && (vector != 4);
+    default:
+        /* Software exceptions/interrupts can be re-executed (e.g., INT n). */
+        break;
+    }
+    return 0;
+}
+
 #endif /* __ASM_X86_HVM_SVM_H__ */
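Illustrative sketch (editorial, not part of the patch): how svm_event_needs_reinjection() would typically be consulted on VMEXIT. The exitintinfo/eventinj accessors and the bit layout (vector in bits 0-7, type in bits 8-10, valid in bit 31) are assumptions for the example; the real definitions live in vmcb.h.

static void example_requeue_pending_event(struct vmcb_struct *vmcb)
{
    uint64_t intinfo = vmcb->exitintinfo.bytes;   /* assumed field name */
    uint8_t  vector  = intinfo & 0xff;
    uint8_t  type    = (intinfo >> 8) & 0x7;

    /* Only re-queue events that cannot simply be re-executed. */
    if ( (intinfo & (1ULL << 31)) &&
         svm_event_needs_reinjection(type, vector) )
        vmcb->eventinj.bytes = intinfo;           /* assumed field name */
}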
diff -Naurp xen/include/asm-x86/hvm/vcpu.h xen-redhat/include/asm-x86/hvm/vcpu.h
--- xen/include/asm-x86/hvm/vcpu.h
+++ xen-redhat/include/asm-x86/hvm/vcpu.h
@@ -30,12 +30,14 @@
 
 struct hvm_vcpu {
     unsigned long       hw_cr3;     /* value we give to HW to use */
-    unsigned long       ioflags;
     struct hvm_io_op    io_op;
     struct vlapic       vlapic;
     s64                 cache_tsc_offset;
     u64                 guest_time;
 
+    /* Is an NMI pending for delivery to this VCPU core? */
+    bool_t              nmi_pending; /* NB. integrate flag with save/restore */
+
     /* Lock and list for virtual platform timers. */
     spinlock_t          tm_lock;
     struct list_head    tm_list;
@@ -52,6 +54,9 @@ struct hvm_vcpu {
         struct arch_vmx_struct vmx;
         struct arch_svm_struct svm;
     } u;
+
+    /* In mode delay_for_missed_ticks, VCPUs have differing guest times.  */  
+    int64_t             stime_offset;
 };
 
 #define ARCH_HVM_IO_WAIT         1   /* Waiting for I/O completion */
diff -Naurp xen/include/asm-x86/hvm/vlapic.h xen-redhat/include/asm-x86/hvm/vlapic.h
--- xen/include/asm-x86/hvm/vlapic.h
+++ xen-redhat/include/asm-x86/hvm/vlapic.h
@@ -78,7 +78,7 @@ int vlapic_set_irq(struct vlapic *vlapic
 int vlapic_find_highest_irr(struct vlapic *vlapic);
 
 int vlapic_has_interrupt(struct vcpu *v);
-int cpu_get_apic_interrupt(struct vcpu *v, int *mode);
+int cpu_get_apic_interrupt(struct vcpu *v);
 
 int  vlapic_init(struct vcpu *v);
 void vlapic_destroy(struct vcpu *v);
@@ -89,12 +89,11 @@ void vlapic_msr_set(struct vlapic *vlapi
 
 int vlapic_accept_pic_intr(struct vcpu *v);
 
+void vlapic_adjust_i8259_target(struct domain *d);
+
 struct vlapic *apic_round_robin(
     struct domain *d, uint8_t vector, uint32_t bitmap);
 
 int vlapic_match_logical_addr(struct vlapic *vlapic, uint8_t mda);
 
-int is_lvtt(struct vcpu *v, int vector);
-int is_lvtt_enabled(struct vcpu *v);
-
 #endif /* __ASM_X86_HVM_VLAPIC_H__ */
diff -Naurp xen/include/asm-x86/hvm/vmx/vmcs.h xen-redhat/include/asm-x86/hvm/vmx/vmcs.h
--- xen/include/asm-x86/hvm/vmx/vmcs.h
+++ xen-redhat/include/asm-x86/hvm/vmx/vmcs.h
@@ -47,6 +47,9 @@ struct vmx_msr_state {
     unsigned long msrs[VMX_MSR_COUNT];
 };
 
+#define EPT_DEFAULT_MT      6
+#define EPT_DEFAULT_GAW     3
+
 struct arch_vmx_struct {
     /* Virtual address of VMCS. */
     struct vmcs_struct  *vmcs;
@@ -62,8 +65,21 @@ struct arch_vmx_struct {
     int                  active_cpu;
     int                  launched;
 
+    union {
+        struct {
+            u64 etmt :3,
+                gaw  :3,
+                rsvd :6,
+                asr  :52;
+        };
+        u64 eptp;
+    } ept_control;
+
     /* Cache of cpu execution control. */
     u32                  exec_control;
+    u32                  secondary_exec_control;
+
+    u16                  vpid;
 
     /* If there is vector installed in the INTR_INFO_FIELD. */
     u32                  vector_injected;
@@ -101,6 +117,8 @@ void vmx_vmcs_exit(struct vcpu *v);
 #define CPU_BASED_MWAIT_EXITING         0x00000400
 #define CPU_BASED_RDPMC_EXITING         0x00000800
 #define CPU_BASED_RDTSC_EXITING         0x00001000
+#define CPU_BASED_CR3_LOAD_EXITING      0x00008000
+#define CPU_BASED_CR3_STORE_EXITING     0x00010000
 #define CPU_BASED_CR8_LOAD_EXITING      0x00080000
 #define CPU_BASED_CR8_STORE_EXITING     0x00100000
 #define CPU_BASED_TPR_SHADOW            0x00200000
@@ -111,7 +129,7 @@ void vmx_vmcs_exit(struct vcpu *v);
 #define CPU_BASED_ACTIVATE_MSR_BITMAP   0x10000000
 #define CPU_BASED_MONITOR_EXITING       0x20000000
 #define CPU_BASED_PAUSE_EXITING         0x40000000
-#define ACTIVATE_SECONDARY_CONTROLS     0x80000000
+#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS     0x80000000
 extern u32 vmx_cpu_based_exec_control;
 
 #define PIN_BASED_EXT_INTR_MASK         0x00000001
@@ -121,25 +139,42 @@ extern u32 vmx_pin_based_exec_control;
 
 #define VM_EXIT_IA32E_MODE              0x00000200
 #define VM_EXIT_ACK_INTR_ON_EXIT        0x00008000
+#define VM_EXIT_SAVE_GUEST_PAT          0x00040000
+#define VM_EXIT_LOAD_HOST_PAT           0x00080000
 extern u32 vmx_vmexit_control;
 
 #define VM_ENTRY_IA32E_MODE             0x00000200
 #define VM_ENTRY_SMM                    0x00000400
 #define VM_ENTRY_DEACT_DUAL_MONITOR     0x00000800
+#define VM_ENTRY_LOAD_GUEST_PAT         0x00004000
 extern u32 vmx_vmentry_control;
 
 #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_ENABLE_EPT               0x00000002
+#define SECONDARY_EXEC_ENABLE_VPID              0x00000020
+
 extern u32 vmx_secondary_exec_control;
 
 #define cpu_has_vmx_virtualize_apic_accesses \
     (vmx_secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
 #define cpu_has_vmx_tpr_shadow \
     (vmx_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)
+#define cpu_has_vmx_vnmi \
+    (vmx_pin_based_exec_control & PIN_BASED_VIRTUAL_NMIS)
 #define cpu_has_vmx_mmap_vtpr_optimization \
     (cpu_has_vmx_virtualize_apic_accesses && cpu_has_vmx_tpr_shadow)
 
 #define cpu_has_vmx_msr_bitmap \
     (vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_MSR_BITMAP)
+#define cpu_has_vmx_secondary_exec_control \
+    (vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
+#define cpu_has_vmx_ept \
+    (vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)
+#define cpu_has_vmx_pat \
+    (vmx_vmentry_control & VM_ENTRY_LOAD_GUEST_PAT)
+#define cpu_has_vmx_vpid \
+    (vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
+
 extern char *vmx_msr_bitmap;
 
 /* GUEST_INTERRUPTIBILITY_INFO flags. */
@@ -150,6 +185,7 @@ extern char *vmx_msr_bitmap;
 
 /* VMCS field encodings. */
 enum vmcs_field {
+    VIRTUAL_PROCESSOR_ID            = 0x00000000,
     GUEST_ES_SELECTOR               = 0x00000800,
     GUEST_CS_SELECTOR               = 0x00000802,
     GUEST_SS_SELECTOR               = 0x00000804,
@@ -183,10 +219,26 @@ enum vmcs_field {
     VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
     APIC_ACCESS_ADDR                = 0x00002014,
     APIC_ACCESS_ADDR_HIGH           = 0x00002015, 
+    EPT_POINTER                     = 0x0000201a,
+    EPT_POINTER_HIGH                = 0x0000201b,
+    GUEST_PHYSICAL_ADDRESS          = 0x00002400,
+    GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
     VMCS_LINK_POINTER               = 0x00002800,
     VMCS_LINK_POINTER_HIGH          = 0x00002801,
     GUEST_IA32_DEBUGCTL             = 0x00002802,
     GUEST_IA32_DEBUGCTL_HIGH        = 0x00002803,
+    GUEST_PAT                       = 0x00002804,
+    GUEST_PAT_HIGH                  = 0x00002805,
+    GUEST_PDPTR0                    = 0x0000280a,
+    GUEST_PDPTR0_HIGH               = 0x0000280b,
+    GUEST_PDPTR1                    = 0x0000280c,
+    GUEST_PDPTR1_HIGH               = 0x0000280d,
+    GUEST_PDPTR2                    = 0x0000280e,
+    GUEST_PDPTR2_HIGH               = 0x0000280f,
+    GUEST_PDPTR3                    = 0x00002810,
+    GUEST_PDPTR3_HIGH               = 0x00002811,
+    HOST_PAT                        = 0x00002c00,
+    HOST_PAT_HIGH                   = 0x00002c01,
     PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
     CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
     EXCEPTION_BITMAP                = 0x00004004,
@@ -277,6 +329,8 @@ enum vmcs_field {
     HOST_RIP                        = 0x00006c16,
 };
 
+#define VMCS_VPID_WIDTH     (16)
+
 #endif /* ASM_X86_HVM_VMX_VMCS_H__ */
 
 /*
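Illustrative sketch (editorial, not part of the patch): loading the ept_control bitfield above into the EPT_POINTER VMCS field. The helper name and the mfn argument are assumptions; EPT_DEFAULT_MT, EPT_DEFAULT_GAW and the field names come from this header.

static void example_set_eptp(struct vcpu *v, unsigned long top_level_mfn)
{
    struct arch_vmx_struct *vmx = &v->arch.hvm_vmx;

    vmx->ept_control.etmt = EPT_DEFAULT_MT;   /* default EPT memory type */
    vmx->ept_control.gaw  = EPT_DEFAULT_GAW;  /* guest address width encoding */
    vmx->ept_control.asr  = top_level_mfn;    /* frame of the top-level table */

    __vmwrite(EPT_POINTER, vmx->ept_control.eptp);
#ifdef __i386__
    __vmwrite(EPT_POINTER_HIGH, vmx->ept_control.eptp >> 32);
#endif
}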
diff -Naurp xen/include/asm-x86/hvm/vmx/vmx.h xen-redhat/include/asm-x86/hvm/vmx/vmx.h
--- xen/include/asm-x86/hvm/vmx/vmx.h
+++ xen-redhat/include/asm-x86/hvm/vmx/vmx.h
@@ -23,15 +23,37 @@
 #include <asm/types.h>
 #include <asm/regs.h>
 #include <asm/processor.h>
-#include <asm/hvm/vmx/vmcs.h>
 #include <asm/i387.h>
+#include <asm/hvm/support.h>
 #include <asm/hvm/trace.h>
+#include <asm/hvm/vmx/vmcs.h>
+#include <asm/paging.h>
+#include <asm/p2m.h>
+
+typedef union {
+    struct {
+        u64 r       :   1,
+        w           :   1,
+        x           :   1,
+        emt         :   3,
+        igmt        :   1,
+        sp_avail    :   1,
+        avail1      :   4,
+        mfn         :   45,
+        rsvd        :   5,
+        avail2      :   2;
+    };
+    u64 epte;
+} ept_entry_t;
+
+#define EPT_TABLE_ORDER     9
+
+extern mfn_t ept_get_entry(struct domain *d, unsigned long gfn, p2m_type_t *t);
 
 void vmx_asm_vmexit_handler(struct cpu_user_regs);
 void vmx_asm_do_vmentry(void);
 void vmx_intr_assist(void);
 void vmx_do_resume(struct vcpu *);
-void set_guest_time(struct vcpu *v, u64 gtime);
 
 extern struct page_info *change_guest_physmap_for_vtpr(struct domain *d,
                                                        int enable_vtpr);
@@ -85,6 +107,8 @@ extern struct page_info *change_guest_ph
 
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 #define EXIT_REASON_APIC_ACCESS         44
+#define EXIT_REASON_EPT_VIOLATION       48
+#define EXIT_REASON_EPT_MISCONFIG       49
 
 /*
  * Interruption-information format
@@ -92,7 +116,9 @@ extern struct page_info *change_guest_ph
 #define INTR_INFO_VECTOR_MASK           0xff            /* 7:0 */
 #define INTR_INFO_INTR_TYPE_MASK        0x700           /* 10:8 */
 #define INTR_INFO_DELIVER_CODE_MASK     0x800           /* 11 */
+#define INTR_INFO_NMI_UNBLOCKED_BY_IRET 0x1000          /* 12 */
 #define INTR_INFO_VALID_MASK            0x80000000      /* 31 */
+#define INTR_INFO_RESVD_BITS_MASK       0x7ffff000
 
 #define INTR_TYPE_EXT_INTR              (0 << 8)    /* external interrupt */
 #define INTR_TYPE_NMI                   (2 << 8)    /* NMI                */
@@ -153,12 +179,15 @@ extern struct page_info *change_guest_ph
 #define VMREAD_OPCODE   ".byte 0x0f,0x78\n"
 #define VMRESUME_OPCODE ".byte 0x0f,0x01,0xc3\n"
 #define VMWRITE_OPCODE  ".byte 0x0f,0x79\n"
+#define INVEPT_OPCODE   ".byte 0x66,0x0f,0x38,0x80\n"   /* m128,r64/32 */
+#define INVVPID_OPCODE  ".byte 0x66,0x0f,0x38,0x81\n"   /* m128,r64/32 */
 #define VMXOFF_OPCODE   ".byte 0x0f,0x01,0xc4\n"
 #define VMXON_OPCODE    ".byte 0xf3,0x0f,0xc7\n"
 
+#define MODRM_EAX_08    ".byte 0x08\n" /* ECX, [EAX] */
 #define MODRM_EAX_06    ".byte 0x30\n" /* [EAX], with reg/opcode: /6 */
 #define MODRM_EAX_07    ".byte 0x38\n" /* [EAX], with reg/opcode: /7 */
-#define MODRM_EAX_ECX   ".byte 0xc1\n" /* [EAX], [ECX] */
+#define MODRM_EAX_ECX   ".byte 0xc1\n" /* EAX, ECX */
 
 static inline void __vmptrld(u64 addr)
 {
@@ -242,6 +271,42 @@ static inline void __vm_clear_bit(unsign
     __vmwrite(field, __vmread(field) & ~(1UL << bit));
 }
 
+static inline void __invept(int ext, u64 eptp, u64 gpa)
+{
+    struct {
+        u64 eptp, gpa;
+    } operand = {eptp, gpa};
+
+    __asm__ __volatile__ ( INVEPT_OPCODE
+                           MODRM_EAX_08
+                           /* CF==1 or ZF==1 --> crash (ud2) */
+                           "ja 1f ; ud2 ; 1:\n"
+                           :
+                           : "a" (&operand), "c" (ext)
+                           : "memory");
+}
+
+static inline void __invvpid(int ext, u16 vpid, u64 gva)
+{
+    struct {
+        u64 vpid:16;
+        u64 rsvd:48;
+        u64 gva;
+    } __attribute__ ((packed)) operand = {vpid, 0, gva};
+
+    /* Fix up #UD exceptions which occur when TLBs are flushed before VMXON. */
+    asm volatile ( "1: " INVVPID_OPCODE MODRM_EAX_08
+                   /* CF==1 or ZF==1 --> crash (ud2) */
+                   "ja 2f ; ud2 ; 2:\n"
+                   ".section __ex_table,\"a\"\n"
+                   "    "__FIXUP_ALIGN"\n"
+                   "    "__FIXUP_WORD" 1b,2b\n"
+                   ".previous"
+                   :
+                   : "a" (&operand), "c" (ext)
+                   : "memory");
+}
+
 static inline void __vmxoff (void)
 {
     __asm__ __volatile__ ( VMXOFF_OPCODE
@@ -263,8 +328,37 @@ static inline int __vmxon (u64 addr)
     return rc;
 }
 
-static inline void __vmx_inject_exception(struct vcpu *v, int trap, int type,
-                                         int error_code, int ilen)
+static inline void ept_sync_all(void)
+{
+    if ( !hap_enabled(current->domain) )
+        return;
+
+    __invept(2, 0, 0);
+}
+
+void ept_sync_domain(struct domain *d);
+
+static inline void vpid_sync_vcpu_gva(struct vcpu *v, unsigned long gva)
+{
+    if ( cpu_has_vmx_vpid )
+        __invvpid(0, v->arch.hvm_vmx.vpid, (u64)gva);
+}
+
+static inline void vpid_sync_vcpu_all(struct vcpu *v)
+{
+    if ( cpu_has_vmx_vpid )
+        __invvpid(1, v->arch.hvm_vmx.vpid, 0);
+}
+
+static inline void vpid_sync_all(void)
+{
+    if ( cpu_has_vmx_vpid )
+        __invvpid(2, 0, 0);
+}
+
+
+static inline void __vmx_inject_exception(
+    struct vcpu *v, int trap, int type, int error_code)
 {
     unsigned long intr_fields;
 
@@ -282,9 +376,6 @@ static inline void __vmx_inject_exceptio
         intr_fields |= INTR_INFO_DELIVER_CODE_MASK;
     }
 
-    if ( ilen )
-      __vmwrite(VM_ENTRY_INSTRUCTION_LEN, ilen);
-
     __vmwrite(VM_ENTRY_INTR_INFO_FIELD, intr_fields);
 
     if (trap == TRAP_page_fault)
@@ -297,21 +388,91 @@ static inline void vmx_inject_hw_excepti
     struct vcpu *v, int trap, int error_code)
 {
     v->arch.hvm_vmx.vector_injected = 1;
-    __vmx_inject_exception(v, trap, INTR_TYPE_HW_EXCEPTION, error_code, 0);
+    __vmx_inject_exception(v, trap, INTR_TYPE_HW_EXCEPTION, error_code);
 }
 
-static inline void vmx_inject_sw_exception(
-    struct vcpu *v, int trap, int instruction_len)
+static inline void vmx_inject_extint(struct vcpu *v, int trap)
 {
-    v->arch.hvm_vmx.vector_injected = 1;
-    __vmx_inject_exception(v, trap, INTR_TYPE_SW_EXCEPTION,
-                           VMX_DELIVER_NO_ERROR_CODE,
-                           instruction_len);
+    __vmx_inject_exception(v, trap, INTR_TYPE_EXT_INTR,
+                           VMX_DELIVER_NO_ERROR_CODE);
 }
 
-static inline void vmx_inject_extint(struct vcpu *v, int trap, int error_code)
+static inline void vmx_inject_nmi(struct vcpu *v)
+{
+    __vmx_inject_exception(v, 2, INTR_TYPE_NMI,
+                           VMX_DELIVER_NO_ERROR_CODE);
+}
+
+void ept_p2m_init(struct domain *d);
+
+/* EPT violation qualifications definitions */
+/* bit offset 0 in exit qualification */
+#define _EPT_READ_VIOLATION         0
+#define EPT_READ_VIOLATION          (1UL<<_EPT_READ_VIOLATION)
+/* bit offset 1 in exit qualification */
+#define _EPT_WRITE_VIOLATION        1
+#define EPT_WRITE_VIOLATION         (1UL<<_EPT_WRITE_VIOLATION)
+/* bit offset 2 in exit qualification */
+#define _EPT_EXEC_VIOLATION         2
+#define EPT_EXEC_VIOLATION          (1UL<<_EPT_EXEC_VIOLATION)
+
+/* bit offset 3 in exit qualification */
+#define _EPT_EFFECTIVE_READ         3
+#define EPT_EFFECTIVE_READ          (1UL<<_EPT_EFFECTIVE_READ)
+/* bit offset 4 in exit qualification */
+#define _EPT_EFFECTIVE_WRITE        4
+#define EPT_EFFECTIVE_WRITE         (1UL<<_EPT_EFFECTIVE_WRITE)
+/* bit offset 5 in exit qualification */
+#define _EPT_EFFECTIVE_EXEC         5
+#define EPT_EFFECTIVE_EXEC          (1UL<<_EPT_EFFECTIVE_EXEC)
+
+/* bit offset 6 in exit qualification */
+#define _EPT_GAW_VIOLATION          6
+#define EPT_GAW_VIOLATION           (1UL<<_EPT_GAW_VIOLATION)
+
+/* bits offset 7 & 8 in exit qualification */
+#define _EPT_GLA_VALIDITY           7
+#define EPT_GLA_VALIDITY_MASK       (3UL<<_EPT_GLA_VALIDITY)
+/* gla != gpa, when load PDPTR */
+#define EPT_GLA_VALIDITY_PDPTR_LOAD (0UL<<_EPT_GLA_VALIDITY)
+/* gla != gpa, during guest page table walking */
+#define EPT_GLA_VALIDITY_GPT_WALK   (1UL<<_EPT_GLA_VALIDITY)
+/* reserved */
+#define EPT_GLA_VALIDITY_RSVD       (2UL<<_EPT_GLA_VALIDITY)
+/* gla == gpa, normal case */
+#define EPT_GLA_VALIDITY_MATCH      (3UL<<_EPT_GLA_VALIDITY)
+
+#define EPT_EFFECTIVE_MASK          (EPT_EFFECTIVE_READ  |  \
+                                     EPT_EFFECTIVE_WRITE |  \
+                                     EPT_EFFECTIVE_EXEC)
+
+#define EPT_PAGETABLE_ENTRIES       512
+
+/*
+ * Need to re-inject a given event? We avoid re-injecting software exceptions
+ * and interrupts because the faulting/trapping instruction can simply be
+ * re-executed (neither VMX nor SVM update RIP when they VMEXIT during
+ * INT3/INTO/INTn).
+ */
+static inline int vmx_event_needs_reinjection(uint8_t type, uint8_t vector)
 {
-    __vmx_inject_exception(v, trap, INTR_TYPE_EXT_INTR, error_code, 0);
+    switch ( type << 8 )
+    {
+    case INTR_TYPE_EXT_INTR:
+    case INTR_TYPE_NMI:
+        return 1;
+    case INTR_TYPE_HW_EXCEPTION:
+        /*
+         * SVM uses type 3 ("HW Exception") for #OF and #BP. We explicitly
+         * check for these vectors, as they are really SW Exceptions. SVM has
+         * not updated RIP to point after the trapping instruction (INT3/INTO).
+         */
+        return (vector != 3) && (vector != 4);
+    default:
+        /* Software exceptions/interrupts can be re-executed (e.g., INT n). */
+        break;
+    }
+    return 0;
 }
 
 #endif /* __ASM_X86_HVM_VMX_VMX_H__ */
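Illustrative sketch (editorial, not part of the patch): decoding an EPT-violation exit qualification with the EPT_*_VIOLATION and EPT_GLA_VALIDITY_* values above. In a real handler the qualification would come from reading the EXIT_QUALIFICATION VMCS field; the printk formatting is only for the example.

static void example_dump_ept_violation(unsigned long qualification, paddr_t gpa)
{
    printk("EPT violation at gpa %lx: %s%s%s, gla %s\n",
           (unsigned long)gpa,
           (qualification & EPT_READ_VIOLATION)  ? "r" : "-",
           (qualification & EPT_WRITE_VIOLATION) ? "w" : "-",
           (qualification & EPT_EXEC_VIOLATION)  ? "x" : "-",
           ((qualification & EPT_GLA_VALIDITY_MASK) == EPT_GLA_VALIDITY_MATCH)
           ? "valid" : "invalid/page-walk");
}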
diff -Naurp xen/include/asm-x86/hvm/vpic.h xen-redhat/include/asm-x86/hvm/vpic.h
--- xen/include/asm-x86/hvm/vpic.h
+++ xen-redhat/include/asm-x86/hvm/vpic.h
@@ -32,7 +32,6 @@
 void vpic_irq_positive_edge(struct domain *d, int irq);
 void vpic_irq_negative_edge(struct domain *d, int irq);
 void vpic_init(struct domain *d);
-int cpu_get_pic_interrupt(struct vcpu *v, int *type);
-int is_periodic_irq(struct vcpu *v, int irq, int type);
+int cpu_get_pic_interrupt(struct vcpu *v);
 
 #endif  /* __ASM_X86_HVM_VPIC_H__ */  
diff -Naurp xen/include/asm-x86/hvm/vpt.h xen-redhat/include/asm-x86/hvm/vpt.h
--- xen/include/asm-x86/hvm/vpt.h
+++ xen-redhat/include/asm-x86/hvm/vpt.h
@@ -29,6 +29,7 @@
 #include <xen/timer.h>
 #include <xen/list.h>
 #include <asm/hvm/vpic.h>
+#include <asm/hvm/irq.h>
 #include <public/hvm/save.h>
 
 struct HPETState;
@@ -39,8 +40,9 @@ struct HPET_timer_fn_info {
 
 typedef struct HPETState {
     struct hvm_hw_hpet hpet;
-    struct vcpu *vcpu;
     uint64_t tsc_freq;
+    uint64_t hpet_to_ns_scale; /* hpet ticks to ns (multiplied by 2^10) */
+    uint64_t hpet_to_ns_limit; /* max hpet ticks convertible to ns      */
     uint64_t mc_offset;
     struct timer timers[HPET_TIMER_NUM];
     struct HPET_timer_fn_info timer_fn_info[HPET_TIMER_NUM]; 
@@ -55,11 +57,16 @@ typedef void time_cb(struct vcpu *v, voi
 
 struct periodic_time {
     struct list_head list;
-    char enabled;
-    char one_shot;              /* one shot time */
+    bool_t on_list;
+    bool_t one_shot;
+    bool_t do_not_freeze;
+    bool_t irq_issued;
+#define PTSRC_isa    1 /* ISA time source */
+#define PTSRC_lapic  2 /* LAPIC time source */
+    u8 source;                  /* PTSRC_ */
     u8 irq;
     struct vcpu *vcpu;          /* vcpu timer interrupt delivers to */
-    u32 pending_intr_nr;        /* the couner for pending timer interrupts */
+    u32 pending_intr_nr;        /* pending timer interrupts */
     u64 period;                 /* frequency in ns */
     u64 period_cycles;          /* frequency in cpu cycles */
     s_time_t scheduled;         /* scheduled timer interrupt */
@@ -102,6 +109,7 @@ typedef struct PMTState {
     struct hvm_hw_pmtimer pm;   /* 32bit timer value */
     struct vcpu *vcpu;          /* Keeps sync with this vcpu's guest-time */
     uint64_t last_gtime;        /* Last (guest) time we updated the timer */
+    uint32_t not_accounted;     /* time not accounted at last update */
     uint64_t scale;             /* Multiplier to get from tsc to timer ticks */
     struct timer timer;         /* To make sure we send SCIs */
     spinlock_t lock;
@@ -112,16 +120,38 @@ struct pl_time {    /* platform time */
     struct RTCState  vrtc;
     struct HPETState vhpet;
     struct PMTState  vpmt;
+    /* guest_time = Xen sys time + stime_offset */
+    int64_t stime_offset;
+    /* Ensures monotonicity in appropriate timer modes. */
+    uint64_t last_guest_time;
+    spinlock_t pl_time_lock;
 };
 
 #define ticks_per_sec(v) (v->domain->arch.hvm_domain.tsc_frequency)
 
-void pt_freeze_time(struct vcpu *v);
-void pt_thaw_time(struct vcpu *v);
+void pt_save_timer(struct vcpu *v);
+void pt_restore_timer(struct vcpu *v);
 void pt_update_irq(struct vcpu *v);
-void pt_intr_post(struct vcpu *v, int vector, int type);
+void pt_intr_post(struct vcpu *v, int vector, enum hvm_intack src);
 void pt_reset(struct vcpu *v);
 void pt_migrate(struct vcpu *v);
+
+void pt_adjust_global_vcpu_target(struct vcpu *v);
+#define pt_global_vcpu_target(d) \
+    ((d)->arch.hvm_domain.i8259_target ? : (d)->vcpu ? (d)->vcpu[0] : NULL)
+
+/* Is given periodic timer active? */
+#define pt_active(pt) ((pt)->on_list)
+
+/*
+ * Create/destroy a periodic (or one-shot!) timer.
+ * The given periodic timer structure must be initialised with zero bytes,
+ * except for the 'source' field which must be initialised with the
+ * correct PTSRC_ value. The initialised timer structure can then be passed
+ * to {create,destroy}_periodic_time() any number of times and in any order.
+ * Note that, for a given periodic timer, invocations of these functions MUST
+ * be serialised.
+ */
 void create_periodic_time(
     struct vcpu *v, struct periodic_time *pt, uint64_t period,
     uint8_t irq, char one_shot, time_cb *cb, void *data);
@@ -134,7 +164,6 @@ void pit_deinit(struct domain *d);
 void rtc_init(struct vcpu *v, int base);
 void rtc_migrate_timers(struct vcpu *v);
 void rtc_deinit(struct domain *d);
-int is_rtc_periodic_irq(void *opaque);
 void pmtimer_init(struct vcpu *v);
 void pmtimer_deinit(struct domain *d);
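Illustrative sketch (editorial, not part of the patch): the calling convention described in the create_periodic_time() comment above -- zero the structure, set only the 'source' field, then create/destroy in any order. The callback body, the 10ms period and the IRQ number are arbitrary example values.

static void example_tick_cb(struct vcpu *v, void *opaque)
{
    /* Account or deliver the virtual timer tick here. */
}

static void example_start_isa_timer(struct vcpu *v, struct periodic_time *pt)
{
    memset(pt, 0, sizeof(*pt));
    pt->source = PTSRC_isa;
    create_periodic_time(v, pt, 10000000ULL /* 10ms, period is in ns */,
                         0 /* irq */, 0 /* periodic, not one-shot */,
                         example_tick_cb, NULL);
}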
 
diff -Naurp xen/include/asm-x86/io_apic.h xen-redhat/include/asm-x86/io_apic.h
--- xen/include/asm-x86/io_apic.h
+++ xen-redhat/include/asm-x86/io_apic.h
@@ -2,9 +2,11 @@
 #define __ASM_IO_APIC_H
 
 #include <xen/config.h>
-#include <asm/fixmap.h>
 #include <asm/types.h>
 #include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/fixmap.h>
+#include <xen/iommu.h>
 
 /*
  * Intel IO-APIC support for SMP and UP systems.
@@ -18,6 +20,8 @@
 		((volatile int *)(__fix_to_virt(FIX_IO_APIC_BASE_0 + idx) \
 		+ (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK)))
 
+#define IO_APIC_ID(idx) (mp_ioapics[idx].mpc_apicid)
+
 /*
  * The structure of the IO-APIC:
  */
@@ -121,18 +125,35 @@ extern struct mpc_config_intsrc mp_irqs[
 /* non-0 if default (table-less) MP configuration */
 extern int mpc_default_type;
 
-static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+/* Only need to remap ioapic RTE (reg: 10~3Fh) */
+#define ioapic_reg_remapped(reg) (iommu_enabled && ((reg) >= 0x10))
+
+static inline unsigned int __io_apic_read(unsigned int apic, unsigned int reg)
 {
 	*IO_APIC_BASE(apic) = reg;
 	return *(IO_APIC_BASE(apic)+4);
 }
 
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+{
+	if (ioapic_reg_remapped(reg))
+	    return io_apic_read_remap_rte(apic, reg);
+	return __io_apic_read(apic, reg);
+}
+
+static inline void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
 {
 	*IO_APIC_BASE(apic) = reg;
 	*(IO_APIC_BASE(apic)+4) = value;
 }
 
+static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+{
+        if (ioapic_reg_remapped(reg))
+            return iommu_update_ire_from_apic(apic, reg, value);
+        __io_apic_write(apic, reg, value);
+}
+
 /*
  * Re-write a value: to be used for read-modify-write
  * cycles where the read already set up the index register.
@@ -146,6 +167,8 @@ extern int sis_apic_bug;
 #endif
 static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
 {
+    if (ioapic_reg_remapped(reg))
+        return iommu_update_ire_from_apic(apic, reg, value);
 	if (sis_apic_bug)
 		*IO_APIC_BASE(apic) = reg;
 	*(IO_APIC_BASE(apic)+4) = value;
@@ -179,5 +202,6 @@ static inline int ioapic_resume(void) {r
 #endif
 
 extern int assign_irq_vector(int irq);
+extern int free_irq_vector(int vector);
 
 #endif
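Illustrative sketch (editorial, not part of the patch): a read-modify-write of an IO-APIC redirection-table entry through the wrappers above. Because register 0x10 is an RTE register, the write is transparently diverted to iommu_update_ire_from_apic() when an IOMMU is enabled. The pin/register numbers are example values; the mask bit at bit 16 is standard IO-APIC RTE layout.

static void __init example_mask_ioapic_pin0(void)
{
    unsigned int low = io_apic_read(0, 0x10); /* low dword of pin 0's RTE */

    low |= (1u << 16);                        /* set the mask bit */
    io_apic_write(0, 0x10, low);
}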
diff -Naurp xen/include/asm-x86/irq.h xen-redhat/include/asm-x86/irq.h
--- xen/include/asm-x86/irq.h
+++ xen-redhat/include/asm-x86/irq.h
@@ -20,6 +20,8 @@
 extern int vector_irq[NR_VECTORS];
 extern u8 irq_vector[NR_IRQ_VECTORS];
 #define AUTO_ASSIGN             -1
+#define NEVER_ASSIGN            -2
+#define FREE_TO_ASSIGN          -3
 
 #define platform_legacy_irq(irq)	((irq) < 16)
 
@@ -48,7 +50,22 @@ extern unsigned long io_apic_irqs;
 extern atomic_t irq_err_count;
 extern atomic_t irq_mis_count;
 
-int pirq_acktype(int irq);
-int pirq_shared(int irq);
+int pirq_acktype(struct domain *d, int irq);
+int pirq_shared(struct domain *d, int irq);
+
+int map_domain_pirq(struct domain *d, int pirq, int vector, int type,
+                           void *data);
+int unmap_domain_pirq(struct domain *d, int pirq);
+int get_free_pirq(struct domain *d, int type, int index);
+void free_domain_pirqs(struct domain *d);
+
+#define domain_irq_to_vector(d, irq) ((d)->arch.pirq_vector[irq] ?: \
+                                      IO_APIC_IRQ(irq) ? 0 : LEGACY_VECTOR(irq))
+#define domain_vector_to_irq(d, vec) ((d)->arch.vector_pirq[vec] ?: \
+                                      ((vec) < FIRST_LEGACY_VECTOR || \
+                                       (vec) > LAST_LEGACY_VECTOR) ? \
+                                      0 : LEGACY_IRQ_FROM_VECTOR(vec))
+
+int pirq_guest_force_unbind(struct domain *d, int irq);
 
 #endif /* _ASM_HW_IRQ_H */
diff -Naurp xen/include/asm-x86/ldt.h xen-redhat/include/asm-x86/ldt.h
--- xen/include/asm-x86/ldt.h
+++ xen-redhat/include/asm-x86/ldt.h
@@ -6,7 +6,6 @@
 
 static inline void load_LDT(struct vcpu *v)
 {
-    unsigned int cpu;
     struct desc_struct *desc;
     unsigned long ents;
 
@@ -16,11 +15,11 @@ static inline void load_LDT(struct vcpu 
     }
     else
     {
-        cpu = smp_processor_id();
-        desc = (!is_pv_32on64_vcpu(v) ? gdt_table : compat_gdt_table)
-               + __LDT(cpu) - FIRST_RESERVED_GDT_ENTRY;
+        desc = (!is_pv_32on64_vcpu(v)
+                ? this_cpu(gdt_table) : this_cpu(compat_gdt_table))
+               + LDT_ENTRY - FIRST_RESERVED_GDT_ENTRY;
         _set_tssldt_desc(desc, LDT_VIRT_START(v), ents*8-1, 2);
-        __asm__ __volatile__ ( "lldt %%ax" : : "a" (__LDT(cpu)<<3) );
+        __asm__ __volatile__ ( "lldt %%ax" : : "a" (LDT_ENTRY << 3) );
     }
 }
 
diff -Naurp xen/include/asm-x86/mm.h xen-redhat/include/asm-x86/mm.h
--- xen/include/asm-x86/mm.h
+++ xen-redhat/include/asm-x86/mm.h
@@ -55,6 +55,41 @@ struct page_info
         u32 tlbflush_timestamp;
 
         /*
+         * When PGT_partial is true then this field is valid and indicates
+         * that PTEs in the range [0, @nr_validated_ptes) have been validated.
+         * An extra page reference must be acquired (or not dropped) whenever
+         * PGT_partial gets set, and it must be dropped when the flag gets
+         * cleared. This is so that a get() leaving a page in partially
+         * validated state (where the caller would drop the reference acquired
+         * due to the getting of the type [apparently] failing [-EAGAIN])
+         * would not accidentally result in a page left with zero general
+         * reference count, but non-zero type reference count (possible when
+         * the partial get() is followed immediately by domain destruction).
+         * Likewise, the ownership of the single type reference for partially
+         * (in-)validated pages is tied to this flag, i.e. the instance
+         * setting the flag must not drop that reference, whereas the instance
+         * clearing it will have to.
+         *
+         * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has
+         * been partially validated. This implies that the general reference
+         * to the page (acquired from get_page_from_lNe()) would be dropped
+         * (again due to the apparent failure) and hence must be re-acquired
+         * when resuming the validation, but must not be dropped when picking
+         * up the page for invalidation.
+         *
+         * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has
+         * been partially invalidated. This is basically the opposite case of
+         * above, i.e. the general reference to the page was not dropped in
+         * put_page_from_lNe() (due to the apparent failure), and hence it
+         * must be dropped when the put operation is resumed (and completes),
+         * but it must not be acquired if picking up the page for validation.
+         */
+        struct {
+            u16 nr_validated_ptes;
+            s8 partial_pte;
+        };
+
+        /*
          * Guest pages with a shadow.  This does not conflict with
          * tlbflush_timestamp since page table pages are explicitly not
          * tracked for TLB-flush avoidance when a guest runs in shadow mode.
@@ -83,9 +118,12 @@ struct page_info
  /* PAE only: is this an L2 page directory containing Xen-private mappings? */
 #define _PGT_pae_xen_l2     26
 #define PGT_pae_xen_l2      (1U<<_PGT_pae_xen_l2)
+/* Has this page been *partially* validated for use as its current type? */
+#define _PGT_partial        25
+#define PGT_partial         (1U<<_PGT_partial)
 
- /* 16-bit count of uses of this frame as its current type. */
-#define PGT_count_mask      ((1U<<16)-1)
+ /* 25-bit count of uses of this frame as its current type. */
+#define PGT_count_mask      ((1U<<25)-1)
 
  /* Cleared when the owning guest 'frees' this page. */
 #define _PGC_allocated      31
@@ -144,8 +182,9 @@ extern unsigned long max_page;
 extern unsigned long total_pages;
 void init_frametable(void);
 
-int alloc_page_type(struct page_info *page, unsigned long type);
-void free_page_type(struct page_info *page, unsigned long type);
+int alloc_page_type(struct page_info *page, unsigned long type, int preemptible);
+int free_page_type(struct page_info *page, unsigned long type,
+		    int preemptible);
 int _shadow_mode_refcounts(struct domain *d);
 
 static inline void put_page(struct page_info *page)
@@ -199,6 +238,8 @@ static inline int get_page(struct page_i
 
 void put_page_type(struct page_info *page);
 int  get_page_type(struct page_info *page, unsigned long type);
+int  put_page_type_preemptible(struct page_info *page);
+int  get_page_type_preemptible(struct page_info *page, unsigned long type);
 int  get_page_from_l1e(l1_pgentry_t l1e, struct domain *d);
 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d);
 
@@ -208,6 +249,19 @@ static inline void put_page_and_type(str
     put_page(page);
 }
 
+static inline int put_page_and_type_preemptible(struct page_info *page,
+                                                int preemptible)
+{
+    int rc = 0;
+
+    if ( preemptible )
+        rc = put_page_type_preemptible(page);
+    else
+        put_page_type(page);
+    if ( likely(rc == 0) )
+        put_page(page);
+    return rc;
+}
 
 static inline int get_page_and_type(struct page_info *page,
                                     struct domain *domain,
@@ -394,12 +448,16 @@ int map_ldt_shadow_page(unsigned int);
 
 #ifdef CONFIG_COMPAT
 int setup_arg_xlat_area(struct vcpu *, l4_pgentry_t *);
+void domain_set_alloc_bitsize(struct domain *d);
 unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits);
 #else
 # define setup_arg_xlat_area(vcpu, l4tab) 0
+# define domain_set_alloc_bitsize(d) ((void)0)
 # define domain_clamp_alloc_bitsize(d, b) (b)
 #endif
 
 unsigned long domain_get_maximum_gpfn(struct domain *d);
 
+extern struct domain *dom_xen, *dom_io;	/* for vmcoreinfo */
+
 #endif /* __ASM_X86_MM_H__ */
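Illustrative sketch (editorial, not part of the patch): the intended retry pattern for the preemptible type-reference helpers declared above. As the PGT_partial comment explains, a preempted operation reports -EAGAIN; a real caller would arrange a hypercall continuation rather than spin as this example does.

static int example_drop_page_table(struct page_info *page)
{
    int rc;

    do {
        rc = put_page_and_type_preemptible(page, 1 /* preemptible */);
    } while ( rc == -EAGAIN );   /* real code would continue the hypercall */

    return rc;
}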
diff -Naurp xen/include/asm-x86/msi.h xen-redhat/include/asm-x86/msi.h
--- xen/include/asm-x86/msi.h
+++ xen-redhat/include/asm-x86/msi.h
@@ -0,0 +1,227 @@
+#ifndef __ASM_MSI_H
+#define __ASM_MSI_H
+
+#include <xen/cpumask.h>
+#include <asm/irq.h>
+/*
+ * Constants for Intel APIC based MSI messages.
+ */
+
+/*
+ * Shifts for MSI data
+ */
+
+#define MSI_DATA_VECTOR_SHIFT		0
+#define  MSI_DATA_VECTOR_MASK		0x000000ff
+#define	 MSI_DATA_VECTOR(v)		(((v) << MSI_DATA_VECTOR_SHIFT) & MSI_DATA_VECTOR_MASK)
+
+#define MSI_DATA_DELIVERY_MODE_SHIFT	8
+#define  MSI_DATA_DELIVERY_FIXED	(0 << MSI_DATA_DELIVERY_MODE_SHIFT)
+#define  MSI_DATA_DELIVERY_LOWPRI	(1 << MSI_DATA_DELIVERY_MODE_SHIFT)
+
+#define MSI_DATA_LEVEL_SHIFT		14
+#define	 MSI_DATA_LEVEL_DEASSERT	(0 << MSI_DATA_LEVEL_SHIFT)
+#define	 MSI_DATA_LEVEL_ASSERT		(1 << MSI_DATA_LEVEL_SHIFT)
+
+#define MSI_DATA_TRIGGER_SHIFT		15
+#define  MSI_DATA_TRIGGER_EDGE		(0 << MSI_DATA_TRIGGER_SHIFT)
+#define  MSI_DATA_TRIGGER_LEVEL		(1 << MSI_DATA_TRIGGER_SHIFT)
+
+/*
+ * Shift/mask fields for msi address
+ */
+
+#define MSI_ADDR_BASE_HI	    	0
+#define MSI_ADDR_BASE_LO	    	0xfee00000
+#define MSI_ADDR_HEADER             MSI_ADDR_BASE_LO
+
+#define MSI_ADDR_DESTMODE_SHIFT     2
+#define MSI_ADDR_DESTMODE_PHYS      (0 << MSI_ADDR_DESTMODE_SHIFT)
+#define MSI_ADDR_DESTMODE_LOGIC     (1 << MSI_ADDR_DESTMODE_SHIFT)
+
+#define MSI_ADDR_REDIRECTION_SHIFT  3
+#define MSI_ADDR_REDIRECTION_CPU    (0 << MSI_ADDR_REDIRECTION_SHIFT)
+#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT)
+
+#define MSI_ADDR_DEST_ID_SHIFT		12
+#define	 MSI_ADDR_DEST_ID_MASK		0x00ffff0
+#define  MSI_ADDR_DEST_ID(dest)		(((dest) << MSI_ADDR_DEST_ID_SHIFT) & MSI_ADDR_DEST_ID_MASK)
+
+/* MAX fixed pages reserved for mapping MSIX tables. */
+#if defined(__x86_64__)
+#define FIX_MSIX_MAX_PAGES              512
+#else
+#define FIX_MSIX_MAX_PAGES              32
+#endif
+
+struct msi_info {
+    int bus;
+    int devfn;
+    int vector;
+    int entry_nr;
+    uint64_t table_base;
+};
+
+struct msi_msg {
+	u32	address_lo;	/* low 32 bits of msi message address */
+	u32	address_hi;	/* high 32 bits of msi message address */
+	u32	data;		/* 16 bits of msi message data */
+};
+
+struct msi_desc;
+/* Helper functions */
+extern void mask_msi_vector(unsigned int vector);
+extern void unmask_msi_vector(unsigned int vector);
+extern void set_msi_affinity(unsigned int vector, cpumask_t mask);
+extern int pci_enable_msi(struct msi_info *msi, struct msi_desc **desc);
+extern void pci_disable_msi(struct msi_desc *desc);
+extern void pci_cleanup_msi(struct pci_dev *pdev);
+extern int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc);
+extern void teardown_msi_vector(int vector);
+extern int msi_free_vector(struct msi_desc *entry);
+extern int pci_restore_msi_state(struct pci_dev *pdev);
+
+extern unsigned int pci_msix_get_table_len(struct pci_dev *pdev);
+
+struct msi_desc {
+	struct {
+		__u8	type	: 5; 	/* {0: unused, 5h:MSI, 11h:MSI-X} */
+		__u8	maskbit	: 1; 	/* mask-pending bit supported ?   */
+		__u8	masked	: 1;
+		__u8	is_64	: 1;	/* Address size: 0=32bit 1=64bit  */
+		__u8	pos;	 	/* Location of the msi capability */
+		__u16	entry_nr;    	/* specific enabled entry 	  */
+	}msi_attrib;
+
+	struct list_head list;
+
+	void __iomem *mask_base;        /* va for the entry in mask table */
+	struct pci_dev *dev;
+	int vector;
+
+	struct msi_msg msg;		/* Last set MSI message */
+
+	int remap_index;		/* index in interrupt remapping table */
+};
+
+int msi_maskable_irq(const struct msi_desc *);
+
+/*
+ * Assume the maximum number of hot-plug slots supported by the system is about
+ * ten. The worst case is that each of these slots is hot-added with a device
+ * which has two MSI/MSI-X capable functions. To keep an MSI-X driver that
+ * requests all available vectors from exhausting them, NR_HP_RESERVED_VECTORS
+ * is defined as below to ensure at least one message is assigned to each
+ * detected MSI/MSI-X device function.
+ */
+#define NR_HP_RESERVED_VECTORS 	20
+
+extern struct hw_interrupt_type pci_msi_type;
+
+/*
+ * MSI-X Address Register
+ */
+#define PCI_MSIX_FLAGS_QSIZE		0x7FF
+#define PCI_MSIX_FLAGS_ENABLE		(1 << 15)
+#define PCI_MSIX_FLAGS_BIRMASK		(7 << 0)
+#define PCI_MSIX_FLAGS_BITMASK		(1 << 0)
+
+#define PCI_MSIX_ENTRY_SIZE			16
+#define  PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET	0
+#define  PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET	4
+#define  PCI_MSIX_ENTRY_DATA_OFFSET		8
+#define  PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET	12
+
+#define msi_control_reg(base)		(base + PCI_MSI_FLAGS)
+#define msi_lower_address_reg(base)	(base + PCI_MSI_ADDRESS_LO)
+#define msi_upper_address_reg(base)	(base + PCI_MSI_ADDRESS_HI)
+#define msi_data_reg(base, is64bit)	\
+	( (is64bit == 1) ? base+PCI_MSI_DATA_64 : base+PCI_MSI_DATA_32 )
+#define msi_mask_bits_reg(base, is64bit) \
+	( (is64bit == 1) ? base+PCI_MSI_MASK_BIT : base+PCI_MSI_MASK_BIT-4)
+#define msi_disable(control)		control &= ~PCI_MSI_FLAGS_ENABLE
+#define multi_msi_capable(control) \
+	(1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
+#define multi_msi_enable(control, num) \
+	control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE);
+#define is_64bit_address(control)	(!!(control & PCI_MSI_FLAGS_64BIT))
+#define is_mask_bit_support(control)	(!!(control & PCI_MSI_FLAGS_MASKBIT))
+#define msi_enable(control, num) multi_msi_enable(control, num); \
+	control |= PCI_MSI_FLAGS_ENABLE
+
+#define msix_control_reg(base)		(base + PCI_MSIX_FLAGS)
+#define msix_table_offset_reg(base)	(base + 0x04)
+#define msix_pba_offset_reg(base)	(base + 0x08)
+#define msix_enable(control)	 	control |= PCI_MSIX_FLAGS_ENABLE
+#define msix_disable(control)	 	control &= ~PCI_MSIX_FLAGS_ENABLE
+#define msix_table_size(control) 	((control & PCI_MSIX_FLAGS_QSIZE)+1)
+#define multi_msix_capable		msix_table_size
+#define msix_unmask(address)	 	(address & ~PCI_MSIX_FLAGS_BITMASK)
+#define msix_mask(address)		(address | PCI_MSIX_FLAGS_BITMASK)
+#define msix_is_pending(address) 	(address & PCI_MSIX_FLAGS_PENDMASK)
+
+/*
+ * MSI Defined Data Structures
+ */
+#define MSI_ADDRESS_HEADER		0xfee
+#define MSI_ADDRESS_HEADER_SHIFT	12
+#define MSI_ADDRESS_HEADER_MASK		0xfff000
+#define MSI_ADDRESS_DEST_ID_MASK	0xfff0000f
+#define MSI_TARGET_CPU_MASK		0xff
+#define MSI_TARGET_CPU_SHIFT		12
+#define MSI_DELIVERY_MODE		0
+#define MSI_LEVEL_MODE			1	/* Edge always assert */
+#define MSI_TRIGGER_MODE		0	/* MSI is edge sensitive */
+#define MSI_PHYSICAL_MODE		0
+#define MSI_LOGICAL_MODE		1
+#define MSI_REDIRECTION_HINT_MODE	0
+
+#define __LITTLE_ENDIAN_BITFIELD	1
+
+struct msg_data {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u32	vector		:  8;
+	__u32	delivery_mode	:  3;	/* 000b: FIXED | 001b: lowest prior */
+	__u32	reserved_1	:  3;
+	__u32	level		:  1;	/* 0: deassert | 1: assert */
+	__u32	trigger		:  1;	/* 0: edge | 1: level */
+	__u32	reserved_2	: 16;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u32	reserved_2	: 16;
+	__u32	trigger		:  1;	/* 0: edge | 1: level */
+	__u32	level		:  1;	/* 0: deassert | 1: assert */
+	__u32	reserved_1	:  3;
+	__u32	delivery_mode	:  3;	/* 000b: FIXED | 001b: lowest prior */
+	__u32	vector		:  8;
+#else
+#error "Bitfield endianness not defined! Check your byteorder.h"
+#endif
+} __attribute__ ((packed));
+
+struct msg_address {
+	union {
+		struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+			__u32	reserved_1	:  2;
+			__u32	dest_mode	:  1;	/*0:physic | 1:logic */
+			__u32	redirection_hint:  1;  	/*0: dedicated CPU
+							  1: lowest priority */
+			__u32	reserved_2	:  4;
+ 			__u32	dest_id		: 24;	/* Destination ID */
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ 			__u32	dest_id		: 24;	/* Destination ID */
+			__u32	reserved_2	:  4;
+			__u32	redirection_hint:  1;  	/*0: dedicated CPU
+							  1: lowest priority */
+			__u32	dest_mode	:  1;	/*0:physic | 1:logic */
+			__u32	reserved_1	:  2;
+#else
+#error "Bitfield endianness not defined! Check your byteorder.h"
+#endif
+      		}u;
+       		__u32  value;
+	}lo_address;
+	__u32 	hi_address;
+} __attribute__ ((packed));
+
+#endif /* __ASM_MSI_H */
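Illustrative sketch (editorial, not part of the patch): composing the address/data halves of an MSI message with the MSI_ADDR_* and MSI_DATA_* constants above. The vector and destination APIC ID are example values; programming the device's capability registers via the msi_*_reg() accessors is a separate step.

static void example_compose_msi(struct msi_msg *msg, u8 vector, u8 dest_id)
{
    msg->address_hi = MSI_ADDR_BASE_HI;
    msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_DESTMODE_PHYS |
                      MSI_ADDR_REDIRECTION_CPU | MSI_ADDR_DEST_ID(dest_id);
    msg->data       = MSI_DATA_TRIGGER_EDGE | MSI_DATA_LEVEL_ASSERT |
                      MSI_DATA_DELIVERY_FIXED | MSI_DATA_VECTOR(vector);
}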
diff -Naurp xen/include/asm-x86/msr.h xen-redhat/include/asm-x86/msr.h
--- xen/include/asm-x86/msr.h
+++ xen-redhat/include/asm-x86/msr.h
@@ -121,6 +121,7 @@ static inline void wrmsrl(unsigned int m
 #define MSR_IA32_VMX_CR4_FIXED0                 0x488
 #define MSR_IA32_VMX_CR4_FIXED1                 0x489
 #define MSR_IA32_VMX_PROCBASED_CTLS2            0x48b
+#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS        0x48e
 #define IA32_FEATURE_CONTROL_MSR                0x3a
 #define IA32_FEATURE_CONTROL_MSR_LOCK           0x1
 #define IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON   0x4
@@ -149,16 +150,14 @@ static inline void wrmsrl(unsigned int m
 
 #ifndef __ASSEMBLY__
 
-DECLARE_PER_CPU(__u64, efer);
+DECLARE_PER_CPU(u64, efer);
 
-static inline __u64 read_efer(void)
+static inline u64 read_efer(void)
 {
-    if (!this_cpu(efer))
-        rdmsrl(MSR_EFER, this_cpu(efer));
     return this_cpu(efer);
 }
 
-static inline void write_efer(__u64 val)
+static inline void write_efer(u64 val)
 {
     this_cpu(efer) = val;
     wrmsrl(MSR_EFER, val);
@@ -199,6 +198,9 @@ static inline void write_efer(__u64 val)
 #define MSR_IA32_PERF_STATUS		0x198
 #define MSR_IA32_PERF_CTL		0x199
 
+#define MSR_IA32_MPERF			0x000000e7
+#define MSR_IA32_APERF			0x000000e8
+
 #define MSR_IA32_THERM_CONTROL		0x19a
 #define MSR_IA32_THERM_INTERRUPT	0x19b
 #define MSR_IA32_THERM_STATUS		0x19c
@@ -207,6 +209,8 @@ static inline void write_efer(__u64 val)
 #define MSR_IA32_MISC_ENABLE_PERF_AVAIL   (1<<7)
 #define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL  (1<<11)
 #define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL (1<<12)
+#define MSR_IA32_MISC_ENABLE_MONITOR_ENABLE (1<<18)
+#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1<<23)
 
 #define MSR_IA32_DEBUGCTLMSR		0x1d9
 #define MSR_IA32_LASTBRANCHFROMIP	0x1db
@@ -214,6 +218,8 @@ static inline void write_efer(__u64 val)
 #define MSR_IA32_LASTINTFROMIP		0x1dd
 #define MSR_IA32_LASTINTTOIP		0x1de
 
+#define MSR_IA32_CR_PAT             0x00000277
+
 #define MSR_IA32_MC0_CTL		0x400
 #define MSR_IA32_MC0_STATUS		0x401
 #define MSR_IA32_MC0_ADDR		0x402
@@ -349,6 +355,18 @@ static inline void write_efer(__u64 val)
 #define MSR_K7_CLK_CTL			0xC001001b
 #define MSR_K7_FID_VID_CTL		0xC0010041
 #define MSR_K7_FID_VID_STATUS		0xC0010042
+#define MSR_K8_PSTATE_LIMIT            0xc0010061
+#define MSR_K8_PSTATE_CTRL             0xc0010062
+#define MSR_K8_PSTATE_STATUS           0xc0010063
+#define MSR_K8_PSTATE0                 0xc0010064
+#define MSR_K8_PSTATE1                 0xc0010065
+#define MSR_K8_PSTATE2                 0xc0010066
+#define MSR_K8_PSTATE3                 0xc0010067
+#define MSR_K8_PSTATE4                 0xc0010068
+#define MSR_K8_PSTATE5                 0xc0010069
+#define MSR_K8_PSTATE6                 0xc001006A
+#define MSR_K8_PSTATE7                 0xc001006B
+#define MSR_K8_ENABLE_C1E              0xc0010055
 
 #define MSR_K8_TOP_MEM1			0xC001001A
 #define MSR_K8_TOP_MEM2			0xC001001D
@@ -357,6 +375,9 @@ static inline void write_efer(__u64 val)
 #define MSR_K8_VM_CR			0xC0010114
 #define MSR_K8_VM_HSAVE_PA		0xC0010117
 
+#define MSR_K8_FIDVID_CTL		0xC0010041
+#define MSR_K8_FIDVID_STATUS		0xC0010042
+
 /* MSR_K8_VM_CR bits: */
 #define _K8_VMCR_SVME_DISABLE		4
 #define K8_VMCR_SVME_DISABLE		(1 << _K8_VMCR_SVME_DISABLE)
diff -Naurp xen/include/asm-x86/mtrr.h xen-redhat/include/asm-x86/mtrr.h
--- xen/include/asm-x86/mtrr.h
+++ xen-redhat/include/asm-x86/mtrr.h
@@ -18,5 +18,8 @@ extern int mtrr_add_page(unsigned long b
 extern int mtrr_del(int reg, unsigned long base, unsigned long size);
 extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
 extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
+extern int hold_mtrr_updates_on_aps;
+extern void mtrr_aps_sync_begin(void);
+extern void mtrr_aps_sync_end(void);
 
 #endif /* __ASM_X86_MTRR_H__ */
diff -Naurp xen/include/asm-x86/numa.h xen-redhat/include/asm-x86/numa.h
--- xen/include/asm-x86/numa.h
+++ xen-redhat/include/asm-x86/numa.h
@@ -52,7 +52,7 @@ struct node_data {
 
 extern struct node_data node_data[];
 
-static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) 
+static inline __attribute__((pure)) int phys_to_nid(paddr_t addr) 
 { 
 	unsigned nid; 
 	VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE);
diff -Naurp xen/include/asm-x86/p2m.h xen-redhat/include/asm-x86/p2m.h
--- xen/include/asm-x86/p2m.h
+++ xen-redhat/include/asm-x86/p2m.h
@@ -26,6 +26,78 @@
 #ifndef _XEN_P2M_H
 #define _XEN_P2M_H
 
+#include <asm/paging.h>
+
+#if 1 /* XEN_VERSION == 3 && XEN_SUBVERSION < 2 */
+
+typedef enum {
+    p2m_invalid = 0,            /* Nothing mapped here */
+    p2m_ram_rw = 1,             /* Normal read/write guest RAM */
+    p2m_ram_logdirty = 2,       /* Temporarily read-only for log-dirty */
+    p2m_ram_ro = 3,             /* Read-only; writes go to the device model */
+    p2m_mmio_dm = 4,            /* Reads and writes go to the device model */
+    p2m_mmio_direct = 5,        /* Read/write mapping of genuine MMIO area */
+} p2m_type_t;
+
+/* We use bitmaps and masks to handle groups of types */
+#define p2m_to_mask(_t) (1UL << (_t))
+
+/* RAM types, which map to real machine frames */
+#define P2M_RAM_TYPES (p2m_to_mask(p2m_ram_rw)          \
+                       | p2m_to_mask(p2m_ram_logdirty)  \
+                       | p2m_to_mask(p2m_ram_ro))
+
+/* MMIO types, which don't have to map to anything in the frametable */
+#define P2M_MMIO_TYPES (p2m_to_mask(p2m_mmio_dm)        \
+                        | p2m_to_mask(p2m_mmio_direct))
+
+/* Read-only types, which must have the _PAGE_RW bit clear in their PTEs */
+#define P2M_RO_TYPES (p2m_to_mask(p2m_ram_logdirty)     \
+                      | p2m_to_mask(p2m_ram_ro))
+
+/* Useful predicates */
+#define p2m_is_ram(_t) (p2m_to_mask(_t) & P2M_RAM_TYPES)
+#define p2m_is_mmio(_t) (p2m_to_mask(_t) & P2M_MMIO_TYPES)
+#define p2m_is_readonly(_t) (p2m_to_mask(_t) & P2M_RO_TYPES)
+#define p2m_is_valid(_t) (p2m_to_mask(_t) & (P2M_RAM_TYPES | P2M_MMIO_TYPES))
+
+/* PTE flags for the various types of p2m entry */
+#define P2M_BASE_FLAGS \
+        (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED)
+
+/* Extract the type from the PTE flags that store it */
+static inline p2m_type_t p2m_flags_to_type(unsigned long flags)
+{
+    if ( (flags & _PAGE_RW) && (flags & _PAGE_PCD) )
+        return p2m_mmio_direct;
+    else if ( flags & _PAGE_RW )
+        return p2m_ram_rw;
+    else if ( paging_mode_log_dirty(current->domain) )
+        return p2m_ram_logdirty;
+    else
+        return p2m_invalid;
+}
+
+static inline unsigned long p2m_type_to_flags(p2m_type_t t)
+{
+    unsigned long flags = 0;
+    switch(t)
+    {
+    case p2m_ram_rw:
+        return flags | P2M_BASE_FLAGS | _PAGE_RW;
+    case p2m_mmio_direct:
+        return flags | P2M_BASE_FLAGS | _PAGE_RW | _PAGE_PCD;
+    case p2m_ram_logdirty:
+    case p2m_ram_ro:
+    case p2m_mmio_dm:
+        return flags | P2M_BASE_FLAGS;
+    case p2m_invalid:
+    default:
+        return flags;
+    }
+}
+
+#endif
 
 /* The phys_to_machine_mapping is the reversed mapping of MPT for full
  * virtualization.  It is only used by shadow_mode_translate()==true
@@ -38,8 +110,19 @@
 /* Read the current domain's P2M table. */
 static inline mfn_t gfn_to_mfn_current(unsigned long gfn)
 {
+    return current->domain->arch.p2m.get_entry_fast(gfn);
+}
+
+/* Read the current domain's p2m table (through the linear mapping). */
+static inline mfn_t p2m_gfn_to_mfn_fast(unsigned long gfn)
+{
     l1_pgentry_t l1e = l1e_empty();
+    l2_pgentry_t l2e = l2e_empty();
     int ret;
+    paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
+#if CONFIG_PAGING_LEVELS >= 4
+    l3_pgentry_t l3e = l3e_empty();
+#endif
 
     if ( gfn > current->domain->arch.p2m.max_mapped_pfn )
         return _mfn(INVALID_MFN);
@@ -47,18 +130,50 @@ static inline mfn_t gfn_to_mfn_current(u
     /* Don't read off the end of the p2m table */
     ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t));
 
-    ret = __copy_from_user(&l1e,
-                           &phys_to_machine_mapping[gfn],
-                           sizeof(l1e));
 
-    if ( (ret == 0) && (l1e_get_flags(l1e) & _PAGE_PRESENT) )
-        return _mfn(l1e_get_pfn(l1e));
+#if CONFIG_PAGING_LEVELS >= 4
+    /* Check whether a 1GB superpage mapping is present for this address. */
+    ret = __copy_from_user(&l3e,
+                           &__linear_l2_table[l2_linear_offset(RO_MPT_VIRT_START) + l3_linear_offset(addr)],
+                           sizeof(l3e));
+    if ( (ret == 0) && (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
+         (l3e_get_flags(l3e) & _PAGE_PSE) )
+    {
+	return _mfn(l3e_get_pfn(l3e) +
+		    l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
+		    l1_table_offset(addr));
+    }
+#endif
+
+    /* check 2MB entry */
+    ret = __copy_from_user(&l2e,
+                           &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START) + l2_linear_offset(addr)],
+                           sizeof(l2e));
+
+    if ( (ret == 0) && (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
+         (l2e_get_flags(l2e) & _PAGE_PSE) )
+    {
+        return _mfn(l2e_get_pfn(l2e) + l1_table_offset(addr));
+    }
+    else
+    {
+        ret = __copy_from_user(&l1e,
+                               &phys_to_machine_mapping[gfn],
+                               sizeof(l1e));
+        
+        if ( (ret == 0) && (l1e_get_flags(l1e) & _PAGE_PRESENT) )
+            return _mfn(l1e_get_pfn(l1e));
+    }
 
     return _mfn(INVALID_MFN);
 }
 
 /* Read another domain's P2M table, mapping pages as we go */
-mfn_t gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn);
+static inline
+mfn_t gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
+{
+    return d->arch.p2m.get_entry(d, gpfn);
+}
 
 /* General conversion function from gfn to mfn */
 static inline mfn_t gfn_to_mfn(struct domain *d, unsigned long gfn)
@@ -104,6 +219,9 @@ gl1e_to_ml1e(struct domain *d, l1_pgentr
 }
 
 
+/* Set mmio addresses in the p2m table (for pass-through) */
+int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
+int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn);
 
 /* Init the datastructures for later use by the p2m code */
 void p2m_init(struct domain *d);
@@ -122,16 +240,18 @@ int p2m_alloc_table(struct domain *d,
 void p2m_teardown(struct domain *d);
 
 /* Add a page to a domain's p2m table */
-void guest_physmap_add_page(struct domain *d, unsigned long gfn,
-                            unsigned long mfn);
+int guest_physmap_add_page(struct domain *d, unsigned long gfn,
+                            unsigned long mfn, int order);
 
 /* Remove a page from a domain's p2m table */
 void guest_physmap_remove_page(struct domain *d, unsigned long gfn,
-                               unsigned long mfn);
+                               unsigned long mfn, int order);
 
 /* set P2M table l1e flags */
 void p2m_set_flags_global(struct domain *d, u32 l1e_flags);
 
+void p2m_change_entry_type_global(struct domain *d, u32 l1e_flags);
+
 /* set P2M table l1e flags for a gpa */
 int p2m_set_flags(struct domain *d, paddr_t gpa, u32 l1e_flags);
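Note on the fast path added above: it resolves a gfn through the hypervisor's linear mapping, and for 1GB (L3) and 2MB (L2) superpage entries it adds the gfn's offset within the superpage to the entry's base frame. A minimal sketch of that arithmetic on plain integers follows; the sk_* names are illustrative and not part of the patched header.

    #include <stdint.h>

    #define SK_L1_ENTRIES  512ULL                     /* PTEs per table level */

    /* 2MB (L2) entry: base pfn plus the low 9 bits of the gfn. */
    static uint64_t sk_mfn_from_l2_superpage(uint64_t l2e_base_pfn, uint64_t gfn)
    {
        return l2e_base_pfn + (gfn & (SK_L1_ENTRIES - 1));
    }

    /* 1GB (L3) entry: base pfn plus the low 18 bits of the gfn, i.e.
     * l2_table_offset(addr) * L1_PAGETABLE_ENTRIES + l1_table_offset(addr). */
    static uint64_t sk_mfn_from_l3_superpage(uint64_t l3e_base_pfn, uint64_t gfn)
    {
        return l3e_base_pfn + (gfn & (SK_L1_ENTRIES * SK_L1_ENTRIES - 1));
    }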
 
diff -Naurp xen/include/asm-x86/page.h xen-redhat/include/asm-x86/page.h
--- xen/include/asm-x86/page.h
+++ xen-redhat/include/asm-x86/page.h
@@ -294,20 +294,6 @@ void paging_init(void);
 void setup_idle_pagetable(void);
 #endif /* !defined(__ASSEMBLY__) */
 
-#define __pge_off()                                                     \
-    do {                                                                \
-        __asm__ __volatile__(                                           \
-            "mov %0, %%cr4;  # turn off PGE     "                       \
-            : : "r" (mmu_cr4_features & ~X86_CR4_PGE) );                \
-        } while ( 0 )
-
-#define __pge_on()                                                      \
-    do {                                                                \
-        __asm__ __volatile__(                                           \
-            "mov %0, %%cr4;  # turn off PGE     "                       \
-            : : "r" (mmu_cr4_features) );                               \
-    } while ( 0 )
-
 #define _PAGE_PRESENT  0x001U
 #define _PAGE_RW       0x002U
 #define _PAGE_USER     0x004U
diff -Naurp xen/include/asm-x86/paging.h xen-redhat/include/asm-x86/paging.h
--- xen/include/asm-x86/paging.h
+++ xen-redhat/include/asm-x86/paging.h
@@ -36,6 +36,9 @@
 /*****************************************************************************
  * Macros to tell which paging mode a domain is in */
 
+#define hap_enabled(d) (hvm_funcs.hap_supported && is_hvm_domain(d))
+#define hap_1gb_pgtb(d) (hvm_funcs.hap_1gb_pgtb && is_hvm_domain(d))
+
 #define PG_SH_shift    20
 #define PG_HAP_shift   21
 /* We're in one of the shadow modes */
diff -Naurp xen/include/asm-x86/processor.h xen-redhat/include/asm-x86/processor.h
--- xen/include/asm-x86/processor.h
+++ xen-redhat/include/asm-x86/processor.h
@@ -8,6 +8,8 @@
 #include <xen/config.h>
 #include <xen/cache.h>
 #include <xen/types.h>
+#include <xen/smp.h>
+#include <xen/percpu.h>
 #include <public/xen.h>
 #include <asm/types.h>
 #include <asm/cpufeature.h>
@@ -194,6 +196,7 @@ extern int phys_proc_id[NR_CPUS];
 extern int cpu_core_id[NR_CPUS];
 
 extern void identify_cpu(struct cpuinfo_x86 *);
+extern void setup_clear_cpu_cap(unsigned int);
 extern void print_cpu_info(struct cpuinfo_x86 *);
 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
 extern void dodgy_tsc(void);
@@ -296,16 +299,17 @@ static inline unsigned long read_cr2(voi
     return __cr2;
 }
 
+DECLARE_PER_CPU(unsigned long, cr4);
+
 static inline unsigned long read_cr4(void)
 {
-    unsigned long __cr4;
-    __asm__("mov %%cr4,%0\n\t" :"=r" (__cr4));
-    return __cr4;
-} 
+    return this_cpu(cr4);
+}
     
 static inline void write_cr4(unsigned long val)
 {
-	__asm__("mov %0,%%cr4": :"r" ((unsigned long)val));
+    this_cpu(cr4) = val;
+    asm volatile ( "mov %0,%%cr4" : : "r" (val) );
 }
 
 
@@ -331,24 +335,14 @@ extern unsigned long mmu_cr4_features;
 
 static always_inline void set_in_cr4 (unsigned long mask)
 {
-    unsigned long dummy;
     mmu_cr4_features |= mask;
-    __asm__ __volatile__ (
-        "mov %%cr4,%0\n\t"
-        "or %1,%0\n\t"
-        "mov %0,%%cr4\n"
-        : "=&r" (dummy) : "irg" (mask) );
+    write_cr4(read_cr4() | mask);
 }
 
 static always_inline void clear_in_cr4 (unsigned long mask)
 {
-    unsigned long dummy;
     mmu_cr4_features &= ~mask;
-    __asm__ __volatile__ (
-        "mov %%cr4,%0\n\t"
-        "and %1,%0\n\t"
-        "mov %0,%%cr4\n"
-        : "=&r" (dummy) : "irg" (~mask) );
+    write_cr4(read_cr4() & ~mask);
 }
 
 /*
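The processor.h hunk above replaces direct MOV-from-CR4 reads with a per-CPU shadow: write_cr4() updates the cached value before touching the register, so read_cr4() becomes a plain memory load. A minimal sketch of the pattern, with an ordinary variable standing in for Xen's per-CPU "cr4"; the sk_* names are illustrative, and the MOV to %cr4 is of course only legal in ring 0.

    static unsigned long sk_cr4_shadow;      /* stand-in for this_cpu(cr4) */

    static inline unsigned long sk_read_cr4(void)
    {
        return sk_cr4_shadow;                /* no MOV-from-CR4 needed */
    }

    static inline void sk_write_cr4(unsigned long val)
    {
        sk_cr4_shadow = val;                             /* cache first  */
        asm volatile ( "mov %0, %%cr4" : : "r" (val) );  /* then the reg */
    }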
diff -Naurp xen/include/asm-x86/smp.h xen-redhat/include/asm-x86/smp.h
--- xen/include/asm-x86/smp.h
+++ xen-redhat/include/asm-x86/smp.h
@@ -13,7 +13,6 @@
 
 #ifdef CONFIG_X86_LOCAL_APIC
 #ifndef __ASSEMBLY__
-#include <asm/fixmap.h>
 #include <asm/bitops.h>
 #include <asm/mpspec.h>
 #ifdef CONFIG_X86_IO_APIC
diff -Naurp xen/include/asm-x86/spinlock.h xen-redhat/include/asm-x86/spinlock.h
--- xen/include/asm-x86/spinlock.h
+++ xen-redhat/include/asm-x86/spinlock.h
@@ -8,22 +8,22 @@
 
 typedef struct {
     volatile s16 lock;
-    s8 recurse_cpu;
+    u8 recurse_cpu;
     u8 recurse_cnt;
 } spinlock_t;
 
 #define SPIN_LOCK_UNLOCKED /*(spinlock_t)*/ { 1, -1, 0 }
 
 #define spin_lock_init(x)	do { *(x) = (spinlock_t) SPIN_LOCK_UNLOCKED; } while(0)
-#define spin_is_locked(x)	(*(volatile char *)(&(x)->lock) <= 0)
+#define spin_is_locked(x)	((x)->lock <= 0)
 
 static inline void _raw_spin_lock(spinlock_t *lock)
 {
     __asm__ __volatile__ (
-        "1:  lock; decb %0         \n"
+        "1:  lock; decw %0         \n"
         "    js 2f                 \n"
         ".section .text.lock,\"ax\"\n"
-        "2:  cmpb $0,%0            \n"
+        "2:  cmpw $0,%0            \n"
         "    rep; nop              \n"
         "    jle 2b                \n"
         "    jmp 1b                \n"
@@ -36,23 +36,23 @@ static inline void _raw_spin_unlock(spin
 #if !defined(CONFIG_X86_OOSTORE)
     ASSERT(spin_is_locked(lock));
     __asm__ __volatile__ (
-	"movb $1,%0" 
+	"movw $1,%0"
         : "=m" (lock->lock) : : "memory" );
 #else
-    char oldval = 1;
+    s16 oldval = 1;
     ASSERT(spin_is_locked(lock));
     __asm__ __volatile__ (
-	"xchgb %b0, %1"
-        : "=q" (oldval), "=m" (lock->lock) : "0" (oldval) : "memory" );
+	"xchgw %w0, %1"
+        : "=r" (oldval), "=m" (lock->lock) : "0" (oldval) : "memory" );
 #endif
 }
 
 static inline int _raw_spin_trylock(spinlock_t *lock)
 {
-    char oldval;
+    s16 oldval;
     __asm__ __volatile__(
-        "xchgb %b0,%1"
-        :"=q" (oldval), "=m" (lock->lock)
+        "xchgw %w0,%1"
+        :"=r" (oldval), "=m" (lock->lock)
         :"0" (0) : "memory");
     return oldval > 0;
 }
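The spinlock hunk above widens the byte operations (decb/cmpb/movb/xchgb) to word operations on the existing 16-bit lock field, presumably so the counter cannot wrap when a very large number of CPUs contend for the lock. The acquire/release semantics are unchanged; here is a sketch of them in C, using GCC atomics in place of the inline asm (sk_* names are illustrative only).

    typedef struct { volatile short lock; } sk_lock_t;    /* 1 = free */

    static void sk_spin_lock(sk_lock_t *l)
    {
        /* "lock; decw": whoever takes the counter from 1 to 0 owns the lock. */
        while ( __sync_sub_and_fetch(&l->lock, 1) < 0 )
        {
            while ( l->lock <= 0 )
                asm volatile ( "rep; nop" ::: "memory" ); /* spin politely */
        }
    }

    static void sk_spin_unlock(sk_lock_t *l)
    {
        l->lock = 1;                                      /* "movw $1" */
    }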
diff -Naurp xen/include/asm-x86/system.h xen-redhat/include/asm-x86/system.h
--- xen/include/asm-x86/system.h
+++ xen-redhat/include/asm-x86/system.h
@@ -14,6 +14,9 @@
 #define wbinvd() \
 	__asm__ __volatile__ ("wbinvd": : :"memory");
 
+#define clflush(a) \
+	__asm__ __volatile__ ("clflush (%0)": :"r"(a));
+
 #define nop() __asm__ __volatile__ ("nop")
 
 #define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
diff -Naurp xen/include/asm-x86/time.h xen-redhat/include/asm-x86/time.h
--- xen/include/asm-x86/time.h
+++ xen-redhat/include/asm-x86/time.h
@@ -26,4 +26,9 @@ extern int time_resume(void);
 
 extern void init_percpu_time(void);
 
+struct ioreq;
+int dom0_pit_access(struct ioreq *ioreq);
+
+int cpu_frequency_change(u64 freq);
+
 #endif /* __X86_TIME_H__ */
diff -Naurp xen/include/asm-x86/x86_32/elf.h xen-redhat/include/asm-x86/x86_32/elf.h
--- xen/include/asm-x86/x86_32/elf.h
+++ xen-redhat/include/asm-x86/x86_32/elf.h
@@ -1,8 +1,6 @@
 #ifndef __X86_32_ELF_H__
 #define __X86_32_ELF_H__
 
-#include <asm/processor.h>
-
 typedef struct {
     unsigned long ebx;
     unsigned long ecx;
@@ -40,7 +38,7 @@ static inline void elf_core_save_regs(EL
     asm volatile("movw %%fs, %%ax;" :"=a"(core_regs->fs));
     asm volatile("movw %%gs, %%ax;" :"=a"(core_regs->gs));
     /* orig_eax not filled in for now */
-    core_regs->eip = (unsigned long)current_text_addr();
+    core_regs->eip = (unsigned long)elf_core_save_regs;
     asm volatile("movw %%cs, %%ax;" :"=a"(core_regs->cs));
     asm volatile("pushfl; popl %0" :"=m"(core_regs->eflags));
     asm volatile("movl %%esp,%0" : "=m"(core_regs->esp));
diff -Naurp xen/include/asm-x86/x86_64/elf.h xen-redhat/include/asm-x86/x86_64/elf.h
--- xen/include/asm-x86/x86_64/elf.h
+++ xen-redhat/include/asm-x86/x86_64/elf.h
@@ -1,8 +1,6 @@
 #ifndef __X86_64_ELF_H__
 #define __X86_64_ELF_H__
 
-#include <asm/processor.h>
-
 typedef struct {
     unsigned long r15;
     unsigned long r14;
@@ -54,7 +52,7 @@ static inline void elf_core_save_regs(EL
     asm volatile("movq %%rsi,%0" : "=m"(core_regs->rsi));
     asm volatile("movq %%rdi,%0" : "=m"(core_regs->rdi));
     /* orig_rax not filled in for now */
-    core_regs->rip = (unsigned long)current_text_addr();
+    core_regs->rip = (unsigned long)elf_core_save_regs;
     asm volatile("movl %%cs, %%eax;" :"=a"(core_regs->cs));
     asm volatile("pushfq; popq %0" :"=m"(core_regs->eflags));
     asm volatile("movq %%rsp,%0" : "=m"(core_regs->rsp));
diff -Naurp xen/include/asm-x86/x86_64/uaccess.h xen-redhat/include/asm-x86/x86_64/uaccess.h
--- xen/include/asm-x86/x86_64/uaccess.h
+++ xen-redhat/include/asm-x86/x86_64/uaccess.h
@@ -8,7 +8,7 @@
  * non-canonical address (and thus fault) before ever reaching VIRT_START.
  */
 #define __addr_ok(addr) \
-    (((unsigned long)(addr) < (1UL<<48)) || \
+    (((unsigned long)(addr) < (1UL<<47)) || \
      ((unsigned long)(addr) >= HYPERVISOR_VIRT_END))
 
 #define access_ok(addr, size) (__addr_ok(addr))
diff -Naurp xen/include/Makefile xen-redhat/include/Makefile
--- xen/include/Makefile
+++ xen-redhat/include/Makefile
@@ -19,7 +19,8 @@ headers-y := \
     compat/version.h \
     compat/xen.h \
     compat/xencomm.h \
-    compat/xenoprof.h
+    compat/xenoprof.h \
+    compat/stratus.h
 headers-$(CONFIG_X86)     += compat/arch-x86/xen.h
 headers-$(CONFIG_X86)     += compat/arch-x86/xen-$(compat-arch-y).h
 headers-y                 += compat/arch-$(compat-arch-y).h compat/xlat.h
diff -Naurp xen/include/public/domctl.h xen-redhat/include/public/domctl.h
--- xen/include/public/domctl.h
+++ xen-redhat/include/public/domctl.h
@@ -432,7 +432,121 @@ struct xen_domctl_sendtrigger {
 typedef struct xen_domctl_sendtrigger xen_domctl_sendtrigger_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_sendtrigger_t);
 
- 
+
+/* Assign PCI device to HVM guest. Sets up IOMMU structures. */
+#define XEN_DOMCTL_assign_device      37
+#define XEN_DOMCTL_test_assign_device 45
+#define XEN_DOMCTL_deassign_device 47
+struct xen_domctl_assign_device {
+    uint32_t  machine_bdf;   /* machine PCI ID of assigned device */
+};
+typedef struct xen_domctl_assign_device xen_domctl_assign_device_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_assign_device_t);
+
+/* Retrieve sibling devices' information for machine_bdf */
+#define XEN_DOMCTL_get_device_group 50
+struct xen_domctl_get_device_group {
+    uint32_t  machine_bdf;      /* IN */
+    uint32_t  max_sdevs;        /* IN */
+    uint32_t  num_sdevs;        /* OUT */
+    XEN_GUEST_HANDLE_64(uint32_t)  sdev_array;   /* OUT */
+};
+typedef struct xen_domctl_get_device_group xen_domctl_get_device_group_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_get_device_group_t);
+
+
+/* Pass-through interrupts: bind real irq -> hvm devfn. */
+#define XEN_DOMCTL_bind_pt_irq       38
+#define XEN_DOMCTL_unbind_pt_irq     48
+typedef enum pt_irq_type_e {
+    PT_IRQ_TYPE_PCI,
+    PT_IRQ_TYPE_ISA,
+    PT_IRQ_TYPE_MSI,
+} pt_irq_type_t;
+struct xen_domctl_bind_pt_irq {
+    uint32_t machine_irq;
+    pt_irq_type_t irq_type;
+    uint32_t hvm_domid;
+
+    union {
+        struct {
+            uint8_t isa_irq;
+        } isa;
+        struct {
+            uint8_t bus;
+            uint8_t device;
+            uint8_t intx;
+        } pci;
+        struct {
+            uint8_t gvec;
+            uint32_t gflags;
+        } msi;
+    } u;
+};
+typedef struct xen_domctl_bind_pt_irq xen_domctl_bind_pt_irq_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_bind_pt_irq_t);
+
+
+/* Bind machine I/O address range -> HVM address range. */
+#define XEN_DOMCTL_memory_mapping    39
+#define DPCI_ADD_MAPPING         1
+#define DPCI_REMOVE_MAPPING      0
+struct xen_domctl_memory_mapping {
+    uint64_aligned_t first_gfn; /* first page (hvm guest phys page) in range */
+    uint64_aligned_t first_mfn; /* first page (machine page) in range */
+    uint64_aligned_t nr_mfns;   /* number of pages in range (>0) */
+    uint32_t add_mapping;       /* add or remove mapping */
+    uint32_t padding;           /* padding for 64-bit aligned structure */
+};
+typedef struct xen_domctl_memory_mapping xen_domctl_memory_mapping_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_memory_mapping_t);
+
+#define XEN_DOMCTL_gdbsx_guestmemio     1000 /* guest mem io */
+struct xen_domctl_gdbsx_memio {
+    uint64_aligned_t pgd3val;/* optional: init_mm.pgd[3] value */
+    uint64_aligned_t gva;    /* guest virtual address */
+    uint64_aligned_t uva;    /* user buffer virtual address */
+    int              len;    /* number of bytes to read/write */
+    int              gwr;    /* 0 = read from guest. 1 = write to guest */
+    int              remain; /* bytes remaining to be copied */
+};
+
+#define XEN_DOMCTL_gdbsx_pausevcpu   1001
+#define XEN_DOMCTL_gdbsx_unpausevcpu 1002
+struct xen_domctl_gdbsx_pauseunp_vcpu { /* pause/unpause a vcpu */
+    uint32_t         vcpu;         /* which vcpu */
+};
+
+#define XEN_DOMCTL_gdbsx_domstatus   1003
+struct xen_domctl_gdbsx_domstatus {
+    int              paused;     /* is the domain paused */
+    uint32_t         vcpu_id;    /* any vcpu in an event? */
+    uint32_t         vcpu_ev;    /* if yes, what event? */
+
+};
+
+/* Bind machine I/O port range -> HVM I/O port range. */
+#define XEN_DOMCTL_ioport_mapping    40
+struct xen_domctl_ioport_mapping {
+    uint32_t first_gport;     /* first guest IO port*/
+    uint32_t first_mport;     /* first machine IO port */
+    uint32_t nr_ports;        /* size of port range */
+    uint32_t add_mapping;     /* add or remove mapping */
+};
+typedef struct xen_domctl_ioport_mapping xen_domctl_ioport_mapping_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_ioport_mapping_t);
+
+/*
+ * Request a particular record from the HVM context
+ */
+#define XEN_DOMCTL_gethvmcontext_partial   55
+typedef struct xen_domctl_hvmcontext_partial {
+    uint32_t type;                        /* IN: Type of record required */
+    uint32_t instance;                    /* IN: Instance of that type */
+    XEN_GUEST_HANDLE_64(uint8_t) buffer;  /* OUT: buffer to write record into */
+} xen_domctl_hvmcontext_partial_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_partial_t);
+
 struct xen_domctl {
     uint32_t cmd;
     uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
@@ -460,8 +574,17 @@ struct xen_domctl {
         struct xen_domctl_settimeoffset     settimeoffset;
         struct xen_domctl_real_mode_area    real_mode_area;
         struct xen_domctl_hvmcontext        hvmcontext;
+        struct xen_domctl_hvmcontext_partial hvmcontext_partial;
         struct xen_domctl_address_size      address_size;
         struct xen_domctl_sendtrigger       sendtrigger;
+        struct xen_domctl_get_device_group  get_device_group;
+        struct xen_domctl_assign_device     assign_device;
+        struct xen_domctl_bind_pt_irq       bind_pt_irq;
+        struct xen_domctl_memory_mapping    memory_mapping;
+        struct xen_domctl_ioport_mapping    ioport_mapping;
+        struct xen_domctl_gdbsx_memio       gdbsx_guest_memio;
+        struct xen_domctl_gdbsx_pauseunp_vcpu gdbsx_pauseunp_vcpu;
+        struct xen_domctl_gdbsx_domstatus   gdbsx_domstatus;
         uint8_t                             pad[128];
     } u;
 };
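The domctl additions above are driven from the toolstack. As an example, a sketch of how the new memory_mapping sub-op might be populated to map an MMIO range into an HVM guest; do_domctl() stands in for whatever actually issues the hypercall (e.g. libxc's xc_domctl) and the include path is an assumption, not something this patch defines.

    #include <string.h>
    #include <stdint.h>
    #include <xen/domctl.h>   /* assumed install path of the public header */

    static int sk_map_mmio(int (*do_domctl)(struct xen_domctl *),
                           uint16_t domid, uint64_t gfn, uint64_t mfn,
                           uint64_t nr_pages)
    {
        struct xen_domctl dc;

        memset(&dc, 0, sizeof(dc));
        dc.cmd = XEN_DOMCTL_memory_mapping;
        dc.interface_version = XEN_DOMCTL_INTERFACE_VERSION;
        dc.domain = domid;
        dc.u.memory_mapping.first_gfn   = gfn;             /* guest phys start */
        dc.u.memory_mapping.first_mfn   = mfn;             /* machine start    */
        dc.u.memory_mapping.nr_mfns     = nr_pages;        /* must be > 0      */
        dc.u.memory_mapping.add_mapping = DPCI_ADD_MAPPING;
        return do_domctl(&dc);
    }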
diff -Naurp xen/include/public/features.h xen-redhat/include/public/features.h
--- xen/include/public/features.h
+++ xen-redhat/include/public/features.h
@@ -56,6 +56,9 @@
  */
 #define XENFEAT_pae_pgdir_above_4gb        4
 
+/* x86: Does this Xen host support the MMU_PT_UPDATE_PRESERVE_AD hypercall? */
+#define XENFEAT_mmu_pt_update_preserve_ad  5
+
 #define XENFEAT_NR_SUBMAPS 1
 
 #endif /* __XEN_PUBLIC_FEATURES_H__ */
diff -Naurp xen/include/public/grant_table.h xen-redhat/include/public/grant_table.h
--- xen/include/public/grant_table.h
+++ xen-redhat/include/public/grant_table.h
@@ -370,7 +370,8 @@ DEFINE_XEN_GUEST_HANDLE(gnttab_query_siz
 #define GNTST_no_device_space  (-7) /* Out of space in I/O MMU.              */
 #define GNTST_permission_denied (-8) /* Not enough privilege for operation.  */
 #define GNTST_bad_page         (-9) /* Specified page was invalid for op.    */
-#define GNTST_bad_copy_arg    (-10) /* copy arguments cross page boundary */
+#define GNTST_bad_copy_arg    (-10) /* copy arguments cross page boundary.   */
+#define GNTST_address_too_big (-11) /* transfer page address too large.      */
 
 #define GNTTABOP_error_msgs {                   \
     "okay",                                     \
@@ -383,7 +384,8 @@ DEFINE_XEN_GUEST_HANDLE(gnttab_query_siz
     "no spare translation slot in the I/O MMU", \
     "permission denied",                        \
     "bad page",                                 \
-    "copy arguments cross page boundary"        \
+    "copy arguments cross page boundary",       \
+    "page address size too large"               \
 }
 
 #endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */
diff -Naurp xen/include/public/hvm/hvm_op.h xen-redhat/include/public/hvm/hvm_op.h
--- xen/include/public/hvm/hvm_op.h
+++ xen-redhat/include/public/hvm/hvm_op.h
@@ -73,4 +73,12 @@ DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_pci_
 /* Flushes all VCPU TLBs: @arg must be NULL. */
 #define HVMOP_flush_tlbs          5
 
+/* Get the current Xen time, in nanoseconds since system boot. */
+#define HVMOP_get_time              10
+struct xen_hvm_get_time {
+    uint64_t now;      /* OUT */
+};
+typedef struct xen_hvm_get_time xen_hvm_get_time_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_get_time_t);
+
 #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
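HVMOP_get_time simply returns Xen system time to the guest. A sketch of guest-side use; hvm_op() stands in for the guest's HYPERVISOR_hvm_op hypercall wrapper, and the include path is an assumption.

    #include <stdint.h>
    #include <xen/hvm/hvm_op.h>   /* assumed install path of the header above */

    static uint64_t sk_xen_time_now(long (*hvm_op)(int cmd, void *arg))
    {
        struct xen_hvm_get_time t = { .now = 0 };

        if ( hvm_op(HVMOP_get_time, &t) != 0 )
            return 0;             /* hypercall failed or not supported */
        return t.now;             /* ns of Xen system time since boot  */
    }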
diff -Naurp xen/include/public/hvm/params.h xen-redhat/include/public/hvm/params.h
--- xen/include/public/hvm/params.h
+++ xen-redhat/include/public/hvm/params.h
@@ -52,9 +52,36 @@
 
 #ifdef __ia64__
 #define HVM_PARAM_NVRAM_FD     7
-#define HVM_NR_PARAMS          8
-#else
-#define HVM_NR_PARAMS          7
 #endif
 
+/*
+ * Set mode for virtual timers (currently x86 only):
+ *  delay_for_missed_ticks (default):
+ *   Do not advance a vcpu's time beyond the correct delivery time for
+ *   interrupts that have been missed due to preemption. Deliver missed
+ *   interrupts when the vcpu is rescheduled and advance the vcpu's virtual
+ *   time stepwise for each one.
+ *  no_delay_for_missed_ticks:
+ *   As above, missed interrupts are delivered, but guest time always tracks
+ *   wallclock (i.e., real) time while doing so.
+ *  no_missed_ticks_pending:
+ *   No missed interrupts are held pending. Instead, to ensure ticks are
+ *   delivered at some non-zero rate, if we detect missed ticks then the
+ *   internal tick alarm is not disabled if the VCPU is preempted during the
+ *   next tick period.
+ *  one_missed_tick_pending:
+ *   Missed interrupts are collapsed together and delivered as one 'late tick'.
+ *   Guest time always tracks wallclock (i.e., real) time.
+ */
+#define HVM_PARAM_TIMER_MODE   10
+#define HVMPTM_delay_for_missed_ticks    0
+#define HVMPTM_no_delay_for_missed_ticks 1
+#define HVMPTM_no_missed_ticks_pending   2
+#define HVMPTM_one_missed_tick_pending   3
+
+/* Boolean: Enable virtual HPET (high-precision event timer)? (x86-only) */
+#define HVM_PARAM_HPET_ENABLED 11
+
+#define HVM_NR_PARAMS          12
+
 #endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
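The timer-mode parameter above is set per domain like any other HVM param, via HVMOP_set_param from hvm_op.h. A sketch; hvm_op() again stands in for the hypercall wrapper and is an assumption.

    #include <stdint.h>
    #include <xen/hvm/hvm_op.h>   /* HVMOP_set_param / struct xen_hvm_param */
    #include <xen/hvm/params.h>   /* HVM_PARAM_TIMER_MODE, HVMPTM_*         */

    static int sk_set_timer_mode(long (*hvm_op)(int cmd, void *arg),
                                 uint16_t domid, uint64_t mode /* HVMPTM_* */)
    {
        struct xen_hvm_param p = {
            .domid = domid,
            .index = HVM_PARAM_TIMER_MODE,
            .value = mode,        /* e.g. HVMPTM_no_missed_ticks_pending */
        };
        return hvm_op(HVMOP_set_param, &p);
    }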
diff -Naurp xen/include/public/hvm/save.h xen-redhat/include/public/hvm/save.h
--- xen/include/public/hvm/save.h
+++ xen-redhat/include/public/hvm/save.h
@@ -328,7 +328,7 @@ struct hvm_hw_pci_irqs {
      * Indexed by: device*4 + INTx#.
      */
     union {
-        DECLARE_BITMAP(i, 32*4);
+        unsigned long i[16 / sizeof (unsigned long)]; /* DECLARE_BITMAP(i, 32*4); */
         uint64_t pad[2];
     };
 };
@@ -341,7 +341,7 @@ struct hvm_hw_isa_irqs {
      * Indexed by ISA IRQ (assumes no ISA-device IRQ sharing).
      */
     union {
-        DECLARE_BITMAP(i, 16);
+        unsigned long i[1];  /* DECLARE_BITMAP(i, 16); */
         uint64_t pad[1];
     };
 };
diff -Naurp xen/include/public/kexec.h xen-redhat/include/public/kexec.h
--- xen/include/public/kexec.h
+++ xen-redhat/include/public/kexec.h
@@ -109,6 +109,7 @@ typedef struct xen_kexec_load {
 #define KEXEC_RANGE_MA_XEN   1   /* machine address and size of Xen itself */
 #define KEXEC_RANGE_MA_CPU   2   /* machine address and size of a CPU note */
 
+#define KEXEC_RANGE_MA_VMCOREINFO 6 /* machine address and size of vmcoreinfo */
 /*
  * Find the address and size of certain memory areas
  * range == KEXEC_RANGE_... [in]
@@ -124,6 +125,27 @@ typedef struct xen_kexec_range {
     unsigned long start;
 } xen_kexec_range_t;
 
+/* vmcoreinfo stuff */
+#define VMCOREINFO_BYTES           (4096)
+#define VMCOREINFO_NOTE_NAME       "VMCOREINFO_XEN"
+void arch_crash_save_vmcoreinfo(void);
+void vmcoreinfo_append_str(const char *fmt, ...)
+       __attribute__ ((format (printf, 1, 2)));
+#define VMCOREINFO_PAGESIZE(value) \
+       vmcoreinfo_append_str("PAGESIZE=%ld\n", value)
+#define VMCOREINFO_SYMBOL(name) \
+       vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name)
+#define VMCOREINFO_SYMBOL_ALIAS(alias, name) \
+       vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #alias, (unsigned long)&name)
+#define VMCOREINFO_STRUCT_SIZE(name) \
+       vmcoreinfo_append_str("SIZE(%s)=%zu\n", #name, sizeof(struct name))
+#define VMCOREINFO_OFFSET(name, field) \
+       vmcoreinfo_append_str("OFFSET(%s.%s)=%zu\n", #name, #field, \
+                             offsetof(struct name, field))
+#define VMCOREINFO_OFFSET_ALIAS(name, field, alias) \
+       vmcoreinfo_append_str("OFFSET(%s.%s)=%zu\n", #name, #alias, \
+                             offsetof(struct name, field))
+
 #endif /* _XEN_PUBLIC_KEXEC_H */
 
 /*
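The VMCOREINFO_* helpers above append "KEY=value" lines that a crash/dump tool later parses. A sketch of how they compose inside an arch_crash_save_vmcoreinfo()-style function; the symbols recorded here (frame_table, max_page, struct page_info) are only examples of the kind of data a dump analyser needs, not a statement of what this patch actually records.

    static void sk_save_vmcoreinfo(void)
    {
        VMCOREINFO_PAGESIZE(PAGE_SIZE);
        VMCOREINFO_SYMBOL(frame_table);
        VMCOREINFO_SYMBOL(max_page);
        VMCOREINFO_STRUCT_SIZE(page_info);
        VMCOREINFO_OFFSET(page_info, count_info);
    }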
diff -Naurp xen/include/public/physdev.h xen-redhat/include/public/physdev.h
--- xen/include/public/physdev.h
+++ xen-redhat/include/public/physdev.h
@@ -21,6 +21,8 @@
 #ifndef __XEN_PUBLIC_PHYSDEV_H__
 #define __XEN_PUBLIC_PHYSDEV_H__
 
+#include "xen.h"
+
 /*
  * Prototype for this hypercall is:
  *  int physdev_op(int cmd, void *args)
@@ -117,7 +119,64 @@ struct physdev_irq {
 };
 typedef struct physdev_irq physdev_irq_t;
 DEFINE_XEN_GUEST_HANDLE(physdev_irq_t);
+ 
+#define MAP_PIRQ_TYPE_MSI               0x0
+#define MAP_PIRQ_TYPE_GSI               0x1
+#define MAP_PIRQ_TYPE_UNKNOWN           0x2
+
+#define PHYSDEVOP_map_pirq               13
+struct physdev_map_pirq {
+    domid_t domid;
+    /* IN */
+    int type;
+    /* IN */
+    int index;
+    /* IN or OUT */
+    int pirq;
+    /* IN */
+    int bus;
+    /* IN */
+    int devfn;
+    /* IN */
+    int entry_nr;
+    /* IN */
+    uint64_t table_base;
+};
+typedef struct physdev_map_pirq physdev_map_pirq_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_map_pirq_t);
+
+#define PHYSDEVOP_unmap_pirq             14
+struct physdev_unmap_pirq {
+    domid_t domid;
+    /* IN */
+    int pirq;
+};
+
+typedef struct physdev_unmap_pirq physdev_unmap_pirq_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_unmap_pirq_t);
+
+#define PHYSDEVOP_manage_pci_add         15
+#define PHYSDEVOP_manage_pci_remove      16
+struct physdev_manage_pci {
+    /* IN */
+    uint8_t bus;
+    uint8_t devfn;
+};
+
+typedef struct physdev_manage_pci physdev_manage_pci_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_manage_pci_t);
+
+/* N.B. RHEL specific hypercall */
+#define PHYSDEVOP_set_device_msixtbl      1999
+struct physdev_device_msixtbl {
+    /* IN */
+    uint8_t bus;
+    uint8_t devfn;
+    uint64_t gtable;
+};
 
+typedef struct physdev_device_msixtbl physdev_device_msixtbl_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_device_msixtbl_t);
 /*
  * Argument to physdev_op_compat() hypercall. Superceded by new physdev_op()
  * hypercall since 0x00030202.
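PHYSDEVOP_map_pirq is how dom0 asks Xen for a pirq backing a GSI or an MSI. A sketch for the GSI case; physdev_op() stands in for HYPERVISOR_physdev_op, and the "pirq = -1 means allocate any" convention follows later Xen versions and is an assumption here.

    #include <stdint.h>
    #include <xen/physdev.h>   /* assumed install path of the header above */

    static int sk_map_gsi(long (*physdev_op)(int cmd, void *arg),
                          uint16_t domid, int gsi)
    {
        struct physdev_map_pirq map = {
            .domid = domid,
            .type  = MAP_PIRQ_TYPE_GSI,
            .index = gsi,
            .pirq  = -1,          /* IN/OUT: let Xen choose the pirq */
        };
        int rc = physdev_op(PHYSDEVOP_map_pirq, &map);
        return rc ? rc : map.pirq;
    }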
diff -Naurp xen/include/public/platform.h xen-redhat/include/public/platform.h
--- xen/include/public/platform.h
+++ xen-redhat/include/public/platform.h
@@ -28,6 +28,7 @@
 #define __XEN_PUBLIC_PLATFORM_H__
 
 #include "xen.h"
+#include "stratus.h"
 
 #define XENPF_INTERFACE_VERSION 0x03000001
 
@@ -153,6 +154,45 @@ struct xenpf_firmware_info {
 typedef struct xenpf_firmware_info xenpf_firmware_info_t;
 DEFINE_XEN_GUEST_HANDLE(xenpf_firmware_info_t);
 
+#define XENPF_stratus_call	0xffffffff
+typedef struct xenpf_stratus_call xenpf_stratus_call_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_stratus_call_t);
+
+#define XENPF_change_freq         52
+struct xenpf_change_freq {
+    /* IN variables */
+    uint32_t flags; /* Must be zero. */
+    uint32_t cpu;   /* Physical cpu. */
+    uint64_t freq;  /* New frequency (Hz). */
+};
+typedef struct xenpf_change_freq xenpf_change_freq_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_change_freq_t);
+
+/*
+ * Get idle times (nanoseconds since boot) for physical CPUs specified in the
+ * @cpumap_bitmap with range [0..@cpumap_nr_cpus-1]. The @idletime array is
+ * indexed by CPU number; only entries with the corresponding @cpumap_bitmap
+ * bit set are written to. On return, @cpumap_bitmap is modified so that any
+ * non-existent CPUs are cleared. Such CPUs have their @idletime array entry
+ * cleared.
+ */
+#define XENPF_getidletime         53
+struct xenpf_getidletime {
+    /* IN/OUT variables */
+    /* IN: CPUs to interrogate; OUT: subset of IN which are present */
+    XEN_GUEST_HANDLE(uint8_t) cpumap_bitmap;
+    /* IN variables */
+    /* Size of cpumap bitmap. */
+    uint32_t cpumap_nr_cpus;
+    /* Must be indexable for every cpu in cpumap_bitmap. */
+    XEN_GUEST_HANDLE(uint64_t) idletime;
+    /* OUT variables */
+    /* System time when the idletime snapshots were taken. */
+    uint64_t now;
+};
+typedef struct xenpf_getidletime xenpf_getidletime_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_getidletime_t);
+
 struct xen_platform_op {
     uint32_t cmd;
     uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
@@ -164,6 +204,9 @@ struct xen_platform_op {
         struct xenpf_microcode_update  microcode;
         struct xenpf_platform_quirk    platform_quirk;
         struct xenpf_firmware_info     firmware_info;
+	struct xenpf_change_freq       change_freq;
+	struct xenpf_getidletime       getidletime;
+	struct xenpf_stratus_call      stratus_call;
         uint8_t                        pad[128];
     } u;
 };
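XENPF_getidletime reports per-CPU idle time plus a matching timestamp, so a monitor takes two snapshots and differences them. The conversion to a percentage is plain arithmetic; sk_idle_percent() below is an illustrative helper, not part of the interface.

    #include <stdint.h>

    static unsigned int sk_idle_percent(uint64_t now0, uint64_t idle0,
                                        uint64_t now1, uint64_t idle1)
    {
        uint64_t wall = now1 - now0;      /* elapsed system time (ns)  */
        uint64_t idle = idle1 - idle0;    /* idle time in that window  */

        if ( wall == 0 )
            return 0;
        if ( idle > wall )                /* clamp rounding artefacts  */
            idle = wall;
        return (unsigned int)((idle * 100) / wall);
    }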
diff -Naurp xen/include/public/stratus.h xen-redhat/include/public/stratus.h
--- xen/include/public/stratus.h
+++ xen-redhat/include/public/stratus.h
@@ -0,0 +1,76 @@
+#ifndef _CC_INTERFACE_H
+#define _CC_INTERFACE_H
+
+// Clear the entire Host BIOS vector
+#define CC_HBV_MEMSET 			1	
+// Read/Write from page 0 (HBV or DUMP)
+#define CC_RW_REGION			2
+// Trigger SMI through local apic
+#define CC_TRIGGER_SMI			3
+// Return local cpu apic id
+#define CC_LAPIC_ID			4
+// Get/Set CR4.
+#define CC_CR4				5
+// Get cpuid
+#define CC_CPUID			6
+// Read/Write MSRs
+#define CC_RW_MSR			7
+// Are we on a Stratus box?
+#define CC_VALIDATE_PLATFORM		8
+
+// Page 0 regions to read/write (host bios vector or dump vector signature).
+#define	RW_HBV		1
+#define	RW_DUMPVEC	2
+
+struct cr4_struct {
+	int rw;		// 0 = read, 1 = write.
+	unsigned long cr4;
+};
+
+struct cpuid_struct {
+	unsigned int op;
+	unsigned int eax, ebx, ecx, edx;	
+};
+
+struct msr_struct {
+	int rw;
+	unsigned int msr;
+	unsigned long val;
+};
+
+struct lapic_struct {
+	int id;
+};
+
+struct rw_struct {
+	int rw;			// 0 = read, 1 = write
+	int region;		// RW_HBV or RW_DUMPVEC
+	void *data;
+	unsigned long where;	// offset in region
+	int size;
+};
+
+struct smi_struct {
+	unsigned int dest;
+};
+
+struct hbv_memset_struct {
+	int val;
+	int size;
+};
+
+struct xenpf_stratus_call {
+	int cmd;
+	int ret;
+	union {
+		struct smi_struct smi;
+		struct hbv_memset_struct hbv_m;
+		struct rw_struct rw;
+		struct lapic_struct ls;
+		struct cr4_struct cr4;
+		struct cpuid_struct cpuid;
+		struct msr_struct msr;
+	} u;
+};
+
+#endif
diff -Naurp xen/include/public/sysctl.h xen-redhat/include/public/sysctl.h
--- xen/include/public/sysctl.h
+++ xen-redhat/include/public/sysctl.h
@@ -76,6 +76,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_sysctl_tbuf_
  */
 #define XEN_SYSCTL_physinfo          3
 struct xen_sysctl_physinfo {
+    /* OUT variables. */
     uint32_t threads_per_core;
     uint32_t cores_per_socket;
     uint32_t sockets_per_node;
@@ -85,6 +86,23 @@ struct xen_sysctl_physinfo {
     uint64_aligned_t free_pages;
     uint64_aligned_t scrub_pages;
     uint32_t hw_cap[8];
+
+    /* IN/OUT variables. */
+    /*
+     * IN: maximum addressable entry in the caller-provided cpu_to_node array.
+     * OUT: largest cpu identifier in the system.
+     * If OUT is greater than IN then the cpu_to_node array is truncated!
+     */
+    uint32_t max_cpu_id;
+    /*
+     * If not NULL, this array is filled with node identifier for each cpu.
+     * If a cpu has no node information (e.g., cpu not present) then the
+     * sentinel value ~0u is written.
+     * The size of this array is specified by the caller in @max_cpu_id.
+     * If the actual @max_cpu_id is smaller than the array then the trailing
+     * elements of the array will not be written by the sysctl.
+     */
+    XEN_GUEST_HANDLE_64(uint32_t) cpu_to_node;
 };
 typedef struct xen_sysctl_physinfo xen_sysctl_physinfo_t;
 DEFINE_XEN_GUEST_HANDLE(xen_sysctl_physinfo_t);
@@ -167,6 +185,18 @@ struct xen_sysctl_getcpuinfo {
 typedef struct xen_sysctl_getcpuinfo xen_sysctl_getcpuinfo_t;
 DEFINE_XEN_GUEST_HANDLE(xen_sysctl_getcpuinfo_t); 
 
+#define XEN_SYSCTL_availheap         9
+struct xen_sysctl_availheap {
+    /* IN variables. */
+    uint32_t min_bitwidth;  /* Smallest address width (zero if don't care). */
+    uint32_t max_bitwidth;  /* Largest address width (zero if don't care). */
+    int32_t  node;          /* NUMA node of interest (-1 for all nodes). */
+    /* OUT variables. */
+    uint64_t avail_bytes;   /* Bytes available in the specified region. */
+};
+typedef struct xen_sysctl_availheap xen_sysctl_availheap_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_availheap_t);
+
 struct xen_sysctl {
     uint32_t cmd;
     uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
@@ -178,6 +208,7 @@ struct xen_sysctl {
         struct xen_sysctl_perfc_op          perfc_op;
         struct xen_sysctl_getdomaininfolist getdomaininfolist;
         struct xen_sysctl_debug_keys        debug_keys;
+        struct xen_sysctl_availheap         availheap;
         struct xen_sysctl_getcpuinfo        getcpuinfo;
         uint8_t                             pad[128];
     } u;
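XEN_SYSCTL_availheap reports how much heap memory satisfies an address-width and NUMA-node constraint. A sketch of a query for memory below 4GB on any node; do_sysctl() stands in for the mechanism that issues the hypercall (e.g. libxc) and the include path is an assumption.

    #include <string.h>
    #include <stdint.h>
    #include <xen/sysctl.h>   /* assumed install path of the header above */

    static uint64_t sk_avail_below_4g(int (*do_sysctl)(struct xen_sysctl *))
    {
        struct xen_sysctl sc;

        memset(&sc, 0, sizeof(sc));
        sc.cmd = XEN_SYSCTL_availheap;
        sc.interface_version = XEN_SYSCTL_INTERFACE_VERSION;
        sc.u.availheap.min_bitwidth = 0;     /* no lower bound         */
        sc.u.availheap.max_bitwidth = 32;    /* only memory below 4GB  */
        sc.u.availheap.node         = -1;    /* any NUMA node          */
        return do_sysctl(&sc) ? 0 : sc.u.availheap.avail_bytes;
    }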
diff -Naurp xen/include/public/vcpu.h xen-redhat/include/public/vcpu.h
--- xen/include/public/vcpu.h
+++ xen-redhat/include/public/vcpu.h
@@ -170,7 +170,7 @@ DEFINE_XEN_GUEST_HANDLE(vcpu_set_singles
  *
  * This may be called only once per vcpu.
  */
-#define VCPUOP_register_vcpu_info   10  /* arg == struct vcpu_info */
+#define VCPUOP_register_vcpu_info   10  /* arg == vcpu_register_vcpu_info_t */
 struct vcpu_register_vcpu_info {
     uint64_t mfn;    /* mfn of page to place vcpu_info */
     uint32_t offset; /* offset within page */
@@ -179,6 +179,22 @@ struct vcpu_register_vcpu_info {
 typedef struct vcpu_register_vcpu_info vcpu_register_vcpu_info_t;
 DEFINE_XEN_GUEST_HANDLE(vcpu_register_vcpu_info_t);
 
+/* 
+ * Get the physical ID information for a pinned vcpu's underlying physical
+ * processor.  The physical ID information is architecture-specific.
+ * On x86: id[7:0]=apic_id, id[15:8]=acpi_id, id[63:16]=mbz,
+ *         and an unavailable identifier is returned as 0xff.
+ * This command returns -EINVAL if it is not a valid operation for this VCPU.
+ */
+#define VCPUOP_get_physid           12 /* arg == vcpu_get_physid_t */
+struct vcpu_get_physid {
+    uint64_t phys_id;
+};
+typedef struct vcpu_get_physid vcpu_get_physid_t;
+DEFINE_XEN_GUEST_HANDLE(vcpu_get_physid_t);
+#define xen_vcpu_physid_to_x86_apicid(physid) ((uint8_t)((physid)>>0))
+#define xen_vcpu_physid_to_x86_acpiid(physid) ((uint8_t)((physid)>>8))
+
 #endif /* __XEN_PUBLIC_VCPU_H__ */
 
 /*
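VCPUOP_get_physid packs the APIC and ACPI IDs of the physical CPU a pinned vcpu runs on into one 64-bit field, decoded with the macros above. A sketch; vcpu_op() stands in for the HYPERVISOR_vcpu_op wrapper and is an assumption.

    #include <stdint.h>
    #include <xen/vcpu.h>   /* assumed install path of the header above */

    static int sk_pinned_cpu_ids(long (*vcpu_op)(int cmd, int vcpuid, void *arg),
                                 int vcpuid, uint8_t *apic_id, uint8_t *acpi_id)
    {
        struct vcpu_get_physid physid;
        int rc = vcpu_op(VCPUOP_get_physid, vcpuid, &physid);

        if ( rc != 0 )
            return rc;            /* e.g. -EINVAL if the vcpu is not pinned */
        *apic_id = xen_vcpu_physid_to_x86_apicid(physid.phys_id);
        *acpi_id = xen_vcpu_physid_to_x86_acpiid(physid.phys_id);
        return 0;
    }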
diff -Naurp xen/include/public/xen.h xen-redhat/include/public/xen.h
--- xen/include/public/xen.h
+++ xen-redhat/include/public/xen.h
@@ -168,9 +168,13 @@
  * ptr[:2]  -- Machine address within the frame whose mapping to modify.
  *             The frame must belong to the FD, if one is specified.
  * val      -- Value to write into the mapping entry.
+ * 
+ * ptr[1:0] == MMU_PT_UPDATE_PRESERVE_AD:
+ * As MMU_NORMAL_PT_UPDATE above, but A/D bits in the PTE are preserved (ORed).
  */
-#define MMU_NORMAL_PT_UPDATE     0 /* checked '*ptr = val'. ptr is MA.       */
-#define MMU_MACHPHYS_UPDATE      1 /* ptr = MA of frame to modify entry for  */
+#define MMU_NORMAL_PT_UPDATE      0 /* checked '*ptr = val'. ptr is MA.      */
+#define MMU_MACHPHYS_UPDATE       1 /* ptr = MA of frame to modify entry for */
+#define MMU_PT_UPDATE_PRESERVE_AD 2 /* '*ptr = val', preserve (OR) A/D bits  */
 
 /*
  * MMU EXTENDED OPERATIONS
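MMU_PT_UPDATE_PRESERVE_AD lets a PV guest rewrite a PTE without losing accessed/dirty bits that the hardware may set concurrently. A sketch of issuing it; mmu_update() stands in for the HYPERVISOR_mmu_update wrapper and is an assumption.

    #include <stdint.h>
    #include <xen/xen.h>   /* assumed install path of the header above */

    static int sk_update_pte_preserve_ad(long (*mmu_update)(struct mmu_update *req,
                                                            unsigned int count,
                                                            unsigned int *done,
                                                            uint16_t foreigndom),
                                         uint64_t pte_maddr, uint64_t new_val)
    {
        struct mmu_update req = {
            /* The low 2 bits of ptr select the sub-command. */
            .ptr = (pte_maddr & ~3ULL) | MMU_PT_UPDATE_PRESERVE_AD,
            .val = new_val,
        };
        return mmu_update(&req, 1, NULL, DOMID_SELF);
    }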
diff -Naurp xen/include/xen/acpi.h xen-redhat/include/xen/acpi.h
--- xen/include/xen/acpi.h
+++ xen-redhat/include/xen/acpi.h
@@ -367,15 +367,88 @@ enum acpi_table_id {
 	ACPI_SPMI,
 	ACPI_HPET,
 	ACPI_MCFG,
+	ACPI_DMAR,
+	ACPI_IVRS,
 	ACPI_TABLE_COUNT
 };
 
+/* DMA Remapping Reporting Table (DMAR) */
+
+#define DMAR_FLAGS_INTR_REMAP 0x1       /* intr remap supported */
+struct acpi_table_dmar {
+	struct acpi_table_header	header;
+	u8				haw;	/* Host address Width */
+	u8				flags;
+	u8				reserved[10];
+} __attribute__ ((packed));
+
+struct acpi_dmar_entry_header {
+	u16	type;
+	u16	length;
+} __attribute__((packed));
+
+enum acpi_dmar_entry_type {
+	ACPI_DMAR_DRHD = 0,
+	ACPI_DMAR_RMRR,
+	ACPI_DMAR_ATSR,
+	ACPI_DMAR_ENTRY_COUNT
+};
+
+#define DRHD_FLAGS_INCLUDE_ALL	0x1       /* drhd remaps remaining devices */
+struct acpi_table_drhd {
+	struct	acpi_dmar_entry_header header;
+	u8	flags;
+	u8	reserved;
+	u16	segment;
+	u64	address; /* register base address for this drhd */
+} __attribute__ ((packed));
+
+struct acpi_table_rmrr {
+	struct	acpi_dmar_entry_header header;
+	u16	reserved;
+	u16	segment;
+	u64	base_address;
+	u64	end_address;
+} __attribute__ ((packed));
+
+struct acpi_table_atsr {
+        struct  acpi_dmar_entry_header header;
+        u8      flags;
+        u8      reserved;
+        u16     segment;
+} __attribute__ ((packed));
+
+enum acpi_dev_scope_type {
+	ACPI_DEV_ENDPOINT=0x01,	/* PCI Endpoint device */
+	ACPI_DEV_P2PBRIDGE,	/* PCI-PCI Bridge */
+	ACPI_DEV_IOAPIC,	/* IOAPIC device*/
+	ACPI_DEV_MSI_HPET,	/* MSI capable HPET*/
+	ACPI_DEV_ENTRY_COUNT
+};
+
+struct acpi_dev_scope {
+	u8	dev_type;
+	u8	length;
+	u8	reserved[2];
+	u8	enum_id;
+	u8	start_bus;
+} __attribute__((packed));
+
+struct acpi_pci_path {
+	u8	dev;
+	u8	fn;
+} __attribute__((packed));
+
+typedef int (*acpi_dmar_entry_handler) (struct acpi_dmar_entry_header *header, const unsigned long end);
+
+
 typedef int (*acpi_table_handler) (unsigned long phys_addr, unsigned long size);
 
 extern acpi_table_handler acpi_table_ops[ACPI_TABLE_COUNT];
 
 typedef int (*acpi_madt_entry_handler) (acpi_table_entry_header *header, const unsigned long end);
 
+unsigned int acpi_get_processor_id (unsigned int cpu);
 char * __acpi_map_table (unsigned long phys_addr, unsigned long size);
 unsigned long acpi_find_rsdp (void);
 int acpi_boot_init (void);
diff -Naurp xen/include/xen/compat.h xen-redhat/include/xen/compat.h
--- xen/include/xen/compat.h
+++ xen-redhat/include/xen/compat.h
@@ -176,15 +176,10 @@ void xlat_vcpu_runstate_info(struct vcpu
 int switch_compat(struct domain *);
 int switch_native(struct domain *);
 
-#define BITS_PER_GUEST_LONG(d) \
-    (!IS_COMPAT(d) ? BITS_PER_LONG : COMPAT_BITS_PER_LONG)
-
 #else
 
 #define compat_handle_is_null(hnd) 0
 
-#define BITS_PER_GUEST_LONG(d) BITS_PER_LONG
-
 #endif
 
 #endif /* __XEN_COMPAT_H__ */
diff -Naurp xen/include/xen/console.h xen-redhat/include/xen/console.h
--- xen/include/xen/console.h
+++ xen-redhat/include/xen/console.h
@@ -26,9 +26,6 @@ void console_force_lock(void);
 void console_start_sync(void);
 void console_end_sync(void);
 
-void console_start_log_everything(void);
-void console_end_log_everything(void);
-
 /*
  * Steal output from the console. Returns +ve identifier, else -ve error.
  * Takes the handle of the serial line to steal, and steal callback function.
diff -Naurp xen/include/xen/cpumask.h xen-redhat/include/xen/cpumask.h
--- xen/include/xen/cpumask.h
+++ xen-redhat/include/xen/cpumask.h
@@ -222,6 +222,15 @@ static inline int __next_cpu(int n, cons
 	return min_t(int, nbits, find_next_bit(srcp->bits, nbits, n+1));
 }
 
+#define last_cpu(src) __last_cpu(&(src), NR_CPUS)
+static inline int __last_cpu(const cpumask_t *srcp, int nbits)
+{
+	int cpu, pcpu = NR_CPUS;
+	for (cpu = first_cpu(*srcp); cpu < NR_CPUS; cpu = next_cpu(cpu, *srcp))
+		pcpu = cpu;
+	return pcpu;
+}
+
 #define cpumask_of_cpu(cpu)						\
 ({									\
 	typeof(_unused_cpumask_arg_) m;					\
diff -Naurp xen/include/xen/dmi.h xen-redhat/include/xen/dmi.h
--- xen/include/xen/dmi.h
+++ xen-redhat/include/xen/dmi.h
@@ -34,5 +34,7 @@ struct dmi_system_id {
 
 extern int dmi_check_system(struct dmi_system_id *list);
 extern char * dmi_get_system_info(int field);
+extern void dmi_scan_machine(void);
+extern int dmi_get_table(u32 *base, u32 *len);
 
 #endif	/* __DMI_H__ */
diff -Naurp xen/include/xen/domain.h xen-redhat/include/xen/domain.h
--- xen/include/xen/domain.h
+++ xen-redhat/include/xen/domain.h
@@ -45,7 +45,7 @@ void arch_domain_destroy(struct domain *
 int arch_set_info_guest(struct vcpu *, vcpu_guest_context_u);
 void arch_get_info_guest(struct vcpu *, vcpu_guest_context_u);
 
-void domain_relinquish_resources(struct domain *d);
+int domain_relinquish_resources(struct domain *d);
 
 void dump_pageframe_info(struct domain *d);
 
diff -Naurp xen/include/xen/elfcore.h xen-redhat/include/xen/elfcore.h
--- xen/include/xen/elfcore.h
+++ xen-redhat/include/xen/elfcore.h
@@ -66,6 +66,7 @@ typedef struct {
     unsigned long xen_compile_time;
     unsigned long tainted;
 #ifdef CONFIG_X86
+    unsigned long xen_phys_start;
     unsigned long dom0_pfn_to_mfn_frame_list_list;
 #endif
 } crash_xen_info_t;
diff -Naurp xen/include/xen/gdbstub.h xen-redhat/include/xen/gdbstub.h
--- xen/include/xen/gdbstub.h
+++ xen-redhat/include/xen/gdbstub.h
@@ -47,6 +47,7 @@ struct gdb_context {
     unsigned long       out_offset;
     u8                  out_csum;
 };
+extern struct gdb_context *gdb_ctx;
 
 /* interface to arch specific routines */
 void gdb_write_to_packet(
diff -Naurp xen/include/xen/hvm/iommu.h xen-redhat/include/xen/hvm/iommu.h
--- xen/include/xen/hvm/iommu.h
+++ xen-redhat/include/xen/hvm/iommu.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Allen Kay <allen.m.kay@intel.com>
+ */
+
+#ifndef __XEN_HVM_IOMMU_H__
+#define __XEN_HVM_IOMMU_H__
+
+#include <xen/iommu.h>
+
+struct g2m_ioport {
+    struct list_head list;
+    unsigned int gport;
+    unsigned int mport;
+    unsigned int np;
+};
+
+struct hvm_iommu {
+    u64 pgd_maddr;                 /* io page directory machine address */
+    spinlock_t mapping_lock;       /* io page table lock */
+    int agaw;     /* adjusted guest address width, 0 is level 2 30-bit */
+    struct list_head g2m_ioport_list;  /* guest to machine ioport mapping */
+    domid_t iommu_domid;           /* domain id stored in iommu */
+    u64 iommu_bitmap;              /* bitmap of iommu(s) that the domain uses */
+
+    /* amd iommu support */
+    int domain_id;
+    int paging_mode;
+    struct page_info *root_table;
+    bool_t p2m_synchronized;
+
+    /* iommu_ops */
+    struct iommu_ops *platform_ops;
+};
+
+#endif /* __XEN_HVM_IOMMU_H__ */
diff -Naurp xen/include/xen/hvm/irq.h xen-redhat/include/xen/hvm/irq.h
--- xen/include/xen/hvm/irq.h
+++ xen-redhat/include/xen/hvm/irq.h
@@ -0,0 +1,99 @@
+/******************************************************************************
+ * irq.h
+ * 
+ * Interrupt distribution and delivery logic.
+ * 
+ * Copyright (c) 2006, K A Fraser, XenSource Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#ifndef __XEN_HVM_IRQ_H__
+#define __XEN_HVM_IRQ_H__
+
+#include <xen/types.h>
+#include <xen/spinlock.h>
+#include <asm/irq.h>
+#include <public/hvm/save.h>
+
+struct dev_intx_gsi_link {
+    struct list_head list;
+    uint8_t device;
+    uint8_t intx;
+    uint8_t gsi;
+    uint8_t link;
+};
+
+#define _HVM_IRQ_DPCI_MSI  0x1
+
+struct hvm_gmsi_info {
+    uint32_t gvec;
+    uint32_t gflags;
+};
+
+struct hvm_mirq_dpci_mapping {
+    uint32_t flags;
+    int pending;
+    struct list_head digl_list;
+    struct domain *dom;
+    struct hvm_gmsi_info gmsi;
+};
+
+struct hvm_girq_dpci_mapping {
+    uint8_t valid;
+    uint8_t device;
+    uint8_t intx;
+    uint8_t machine_gsi;
+};
+
+#define NR_ISAIRQS  16
+#define NR_LINK     4
+
+/* Protected by domain's event_lock */
+struct hvm_irq_dpci {
+    /* Machine IRQ to guest device/intx mapping. */
+    DECLARE_BITMAP(mapping, NR_IRQS);
+    struct hvm_mirq_dpci_mapping mirq[NR_IRQS];
+    /* Guest IRQ to guest device/intx mapping. */
+    struct hvm_girq_dpci_mapping girq[NR_IRQS];
+    uint8_t msi_gvec_pirq[NR_VECTORS];
+    DECLARE_BITMAP(dirq_mask, NR_IRQS);
+    /* Record of mapped ISA IRQs */
+    DECLARE_BITMAP(isairq_map, NR_ISAIRQS);
+    /* Record of mapped Links */
+    uint8_t link_cnt[NR_LINK];
+    struct timer hvm_timer[NR_IRQS];
+};
+
+/* Modify state of a PCI INTx wire. */
+void hvm_pci_intx_assert(
+    struct domain *d, unsigned int device, unsigned int intx);
+void hvm_pci_intx_deassert(
+    struct domain *d, unsigned int device, unsigned int intx);
+
+/* Modify state of an ISA device's IRQ wire. */
+void hvm_isa_irq_assert(
+    struct domain *d, unsigned int isa_irq);
+void hvm_isa_irq_deassert(
+    struct domain *d, unsigned int isa_irq);
+
+void hvm_set_pci_link_route(struct domain *d, u8 link, u8 isa_irq);
+
+void hvm_maybe_deassert_evtchn_irq(void);
+void hvm_assert_evtchn_irq(struct vcpu *v);
+void hvm_set_callback_via(struct domain *d, uint64_t via);
+
+void hvm_dirq_assist(struct vcpu *v);
+
+#endif /* __XEN_HVM_IRQ_H__ */
diff -Naurp xen/include/xen/iommu.h xen-redhat/include/xen/iommu.h
--- xen/include/xen/iommu.h
+++ xen-redhat/include/xen/iommu.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Allen Kay <allen.m.kay@intel.com>
+ */
+
+#ifndef _IOMMU_H_
+#define _IOMMU_H_
+
+#include <xen/init.h>
+#include <xen/spinlock.h>
+#include <xen/pci.h>
+#include <public/hvm/ioreq.h>
+#include <public/domctl.h>
+
+extern int vtd_enabled;
+extern int iommu_enabled;
+extern int iommu_pv_enabled;
+extern int force_iommu;
+extern int iommu_passthrough;
+extern int iommu_snoop;
+extern int iommu_intremap;
+
+#define domain_hvm_iommu(d)     (&d->arch.hvm_domain.hvm_iommu)
+
+#define MAX_IOMMUS 32
+
+#define PAGE_SHIFT_4K       (12)
+#define PAGE_SIZE_4K        (1UL << PAGE_SHIFT_4K)
+#define PAGE_MASK_4K        (((u64)-1) << PAGE_SHIFT_4K)
+#define PAGE_ALIGN_4K(addr) (((addr) + PAGE_SIZE_4K - 1) & PAGE_MASK_4K)
+
+struct iommu {
+    struct list_head list;
+    void __iomem *reg; /* Pointer to hardware regs, virtual addr */
+    u32	index;         /* Sequence number of iommu */
+    u32	gcmd;          /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */
+    u32 nr_pt_levels;
+    u64	cap;
+    u64	ecap;
+    spinlock_t lock; /* protect context, domain ids */
+    spinlock_t register_lock; /* protect iommu register handling */
+    u64 root_maddr; /* root entry machine address */
+    unsigned int vector;
+    struct intel_iommu *intel;
+};
+
+int iommu_setup(void);
+
+int iommu_add_device(struct pci_dev *pdev);
+int iommu_remove_device(struct pci_dev *pdev);
+int iommu_domain_init(struct domain *d);
+void iommu_domain_destroy(struct domain *d);
+int device_assigned(u8 bus, u8 devfn);
+int assign_device(struct domain *d, u8 bus, u8 devfn);
+int deassign_device(struct domain *d, u8 bus, u8 devfn);
+int iommu_get_device_group(struct domain *d, u8 bus, u8 devfn,
+    XEN_GUEST_HANDLE_64(uint32_t) buf, int max_sdevs);
+int iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn);
+int iommu_unmap_page(struct domain *d, unsigned long gfn);
+void iommu_domain_teardown(struct domain *d);
+int hvm_do_IRQ_dpci(struct domain *d, unsigned int irq);
+int dpci_ioport_intercept(ioreq_t *p);
+int pt_irq_create_bind_vtd(struct domain *d,
+                           xen_domctl_bind_pt_irq_t *pt_irq_bind);
+int pt_irq_destroy_bind_vtd(struct domain *d,
+                            xen_domctl_bind_pt_irq_t *pt_irq_bind);
+unsigned int io_apic_read_remap_rte(unsigned int apic, unsigned int reg);
+void io_apic_write_remap_rte(unsigned int apic,
+                             unsigned int reg, unsigned int value);
+
+struct msi_desc;
+struct msi_msg;
+void msi_msg_read_remap_rte(struct msi_desc *msi_desc, struct msi_msg *msg);
+void msi_msg_write_remap_rte(struct msi_desc *msi_desc, struct msi_msg *msg);
+struct qi_ctrl *iommu_qi_ctrl(struct iommu *iommu);
+struct ir_ctrl *iommu_ir_ctrl(struct iommu *iommu);
+struct iommu_flush *iommu_get_flush(struct iommu *iommu);
+void hvm_dpci_isairq_eoi(struct domain *d, unsigned int isairq);
+struct hvm_irq_dpci *domain_get_irq_dpci(struct domain *domain);
+int domain_set_irq_dpci(struct domain *domain, struct hvm_irq_dpci *dpci);
+
+#define PT_IRQ_TIME_OUT MILLISECS(8)
+#define VTDPREFIX "[VT-D]"
+
+struct iommu_ops {
+    int (*init)(struct domain *d);
+    int (*add_device)(struct pci_dev *pdev);
+    int (*remove_device)(struct pci_dev *pdev);
+    int (*assign_device)(struct domain *d, u8 bus, u8 devfn);
+    void (*teardown)(struct domain *d);
+    int (*map_page)(struct domain *d, unsigned long gfn, unsigned long mfn);
+    int (*unmap_page)(struct domain *d, unsigned long gfn);
+    int (*reassign_device)(struct domain *s, struct domain *t,
+			   u8 bus, u8 devfn);
+    int (*get_device_group_id)(u8 bus, u8 devfn);
+    void (*update_ire_from_apic)(unsigned int apic, unsigned int reg, unsigned int value);
+    void (*update_ire_from_msi)(struct msi_desc *msi_desc, struct msi_msg *msg);
+};
+
+void iommu_update_ire_from_apic(unsigned int apic, unsigned int reg, unsigned int value);
+void iommu_update_ire_from_msi(struct msi_desc *msi_desc, struct msi_msg *msg);
+
+void iommu_suspend(void);
+void iommu_resume(void);
+
+#endif /* _IOMMU_H_ */
diff -Naurp xen/include/xen/irq.h xen-redhat/include/xen/irq.h
--- xen/include/xen/irq.h
+++ xen-redhat/include/xen/irq.h
@@ -4,6 +4,7 @@
 #include <xen/config.h>
 #include <xen/cpumask.h>
 #include <xen/spinlock.h>
+#include <xen/time.h>
 #include <asm/regs.h>
 #include <asm/hardirq.h>
 
@@ -44,6 +45,7 @@ typedef struct hw_interrupt_type hw_irq_
 
 #include <asm/irq.h>
 
+struct msi_desc;
 /*
  * This is the "IRQ descriptor", which contains various information
  * about the irq, including what kind of hardware handling it has,
@@ -54,15 +56,27 @@ typedef struct hw_interrupt_type hw_irq_
 typedef struct {
     unsigned int status;		/* IRQ status */
     hw_irq_controller *handler;
+    struct msi_desc   *msi_desc;
     struct irqaction *action;	/* IRQ action list */
     unsigned int depth;		/* nested irq disables */
+    int vector;
     spinlock_t lock;
+    cpumask_t affinity;
+
+    /* irq ratelimit */
+    s_time_t rl_quantum_start;
+    unsigned int rl_cnt;
+    struct list_head rl_link;
 } __cacheline_aligned irq_desc_t;
 
 extern irq_desc_t irq_desc[NR_IRQS];
 
 extern int setup_irq(unsigned int, struct irqaction *);
 extern void free_irq(unsigned int);
+extern int request_irq(unsigned int irq,
+                       void (*handler)(int, void *, struct cpu_user_regs *),
+                       unsigned long irqflags, const char * devname,
+                       void *dev_id);
 
 extern hw_irq_controller no_irq_type;
 extern void no_action(int cpl, void *dev_id, struct cpu_user_regs *regs);
@@ -72,6 +86,20 @@ struct vcpu;
 extern int pirq_guest_eoi(struct domain *d, int irq);
 extern int pirq_guest_unmask(struct domain *d);
 extern int pirq_guest_bind(struct vcpu *v, int irq, int will_share);
-extern int pirq_guest_unbind(struct domain *d, int irq);
+extern void pirq_guest_unbind(struct domain *d, int irq);
+extern irq_desc_t *domain_spin_lock_irq_desc(
+		    struct domain *d, int irq, unsigned long *pflags);
+
+static inline void set_native_irq_info(unsigned int vector, cpumask_t mask)
+{
+    irq_desc[vector].affinity = mask;
+}
+
+#ifdef irq_to_vector
+static inline void set_irq_info(int irq, cpumask_t mask)
+{
+    set_native_irq_info(irq_to_vector(irq), mask);
+}
+#endif
 
 #endif /* __XEN_IRQ_H__ */
diff -Naurp xen/include/xen/mm.h xen-redhat/include/xen/mm.h
--- xen/include/xen/mm.h
+++ xen-redhat/include/xen/mm.h
@@ -61,6 +61,8 @@ struct page_info *__alloc_domheap_pages(
     struct domain *d, unsigned int cpu, unsigned int order, 
     unsigned int memflags);
 void free_domheap_pages(struct page_info *pg, unsigned int order);
+unsigned long avail_domheap_pages_region(
+    unsigned int node, unsigned int min_width, unsigned int max_width);
 unsigned long avail_domheap_pages(void);
 #define alloc_domheap_page(d) (alloc_domheap_pages(d,0,0))
 #define free_domheap_page(p)  (free_domheap_pages(p,0))
@@ -85,19 +87,7 @@ int assign_pages(
 #define MAX_ORDER 20 /* 2^20 contiguous pages */
 #endif
 
-/* Automatic page scrubbing for dead domains. */
-extern struct list_head page_scrub_list;
-#define page_scrub_schedule_work()              \
-    do {                                        \
-        if ( !list_empty(&page_scrub_list) )    \
-            raise_softirq(PAGE_SCRUB_SOFTIRQ);  \
-    } while ( 0 )
-#define page_scrub_kick()                                               \
-    do {                                                                \
-        if ( !list_empty(&page_scrub_list) )                            \
-            cpumask_raise_softirq(cpu_online_map, PAGE_SCRUB_SOFTIRQ);  \
-    } while ( 0 )
-unsigned long avail_scrub_pages(void);
+void scrub_one_page(struct page_info *);
 
 #include <asm/mm.h>
 
@@ -106,4 +96,6 @@ int guest_remove_page(struct domain *d, 
 /* Returns TRUE if the memory at address @p is ordinary RAM. */
 int memory_is_conventional_ram(paddr_t p);
 
+extern unsigned long *alloc_bitmap;	/* for vmcoreinfo */
+
 #endif /* __XEN_MM_H__ */
diff -Naurp xen/include/xen/paging.h xen-redhat/include/xen/paging.h
--- xen/include/xen/paging.h
+++ xen-redhat/include/xen/paging.h
@@ -18,8 +18,8 @@
 #else
 
 #define paging_mode_translate(d)              (0)
-#define guest_physmap_add_page(d, p, m)       ((void)0)
-#define guest_physmap_remove_page(d, p, m)    ((void)0)
+#define guest_physmap_add_page(d, p, m, order)       (0)
+#define guest_physmap_remove_page(d, p, m, order)    ((void)0)
 
 #endif
 
diff -Naurp xen/include/xen/pci.h xen-redhat/include/xen/pci.h
--- xen/include/xen/pci.h
+++ xen-redhat/include/xen/pci.h
@@ -0,0 +1,93 @@
+/******************************************************************************
+ * pci.h
+ * 
+ * PCI access functions.
+ */
+
+#ifndef __XEN_PCI_H__
+#define __XEN_PCI_H__
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/list.h>
+#include <xen/spinlock.h>
+
+/*
+ * The PCI interface treats multi-function devices as independent
+ * devices.  The slot/function address of each device is encoded
+ * in a single byte as follows:
+ *
+ * 15:8 = bus
+ *  7:3 = slot
+ *  2:0 = function
+ */
+#define PCI_BUS(bdf)    (((bdf) >> 8) & 0xff)
+#define PCI_SLOT(bdf)   (((bdf) >> 3) & 0x1f)
+#define PCI_FUNC(bdf)   ((bdf) & 0x07)
+#define PCI_DEVFN(d,f)  ((((d) & 0x1f) << 3) | ((f) & 0x07))
+#define PCI_DEVFN2(bdf) ((bdf) & 0xff)
+#define PCI_BDF(b,d,f)  ((((b) & 0xff) << 8) | PCI_DEVFN(d,f))
+#define PCI_BDF2(b,df)  ((((b) & 0xff) << 8) | ((df) & 0xff))
+
+#define MAX_MSIX_TABLE_ENTRIES  2048
+#define MAX_MSIX_TABLE_PAGES    8
+struct pci_dev {
+    struct list_head alldevs_list;
+    struct list_head domain_list;
+
+    struct list_head msi_list;
+    int msix_table_refcnt[MAX_MSIX_TABLE_PAGES];
+    int msix_table_idx[MAX_MSIX_TABLE_PAGES];
+    spinlock_t msix_table_lock;
+    u64 msix_table;
+
+    struct domain *domain;
+    const u8 bus;
+    const u8 devfn;
+};
+
+#define for_each_pdev(domain, pdev) \
+    list_for_each_entry(pdev, &(domain->arch.pdev_list), domain_list)
+
+/*
+ * The pcidevs_lock protect alldevs_list, and the assignment for the
+ * devices, it also sync the access to the msi capability that is not
+ * interrupt handling related (the mask bit register).
+ */
+
+extern spinlock_t pcidevs_lock;
+
+struct pci_dev *alloc_pdev(u8 bus, u8 devfn);
+void free_pdev(struct pci_dev *pdev);
+struct pci_dev *pci_lock_pdev(int bus, int devfn);
+struct pci_dev *pci_lock_domain_pdev(struct domain *d, int bus, int devfn);
+
+void pci_release_devices(struct domain *d);
+int pci_add_device(u8 bus, u8 devfn);
+int pci_remove_device(u8 bus, u8 devfn);
+struct pci_dev *pci_get_pdev(int bus, int devfn);
+struct pci_dev *pci_get_pdev_by_domain(struct domain *d, int bus, int devfn);
+
+uint8_t pci_conf_read8(
+    unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg);
+uint16_t pci_conf_read16(
+    unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg);
+uint32_t pci_conf_read32(
+    unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg);
+void pci_conf_write8(
+    unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg,
+    uint8_t data);
+void pci_conf_write16(
+    unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg,
+    uint16_t data);
+void pci_conf_write32(
+    unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg,
+    uint32_t data);
+int pci_find_cap_offset(u8 bus, u8 dev, u8 func, u8 cap);
+int pci_find_next_cap(u8 bus, unsigned int devfn, u8 pos, int cap);
+
+int pci_set_device_msixtbl(u8 bus, u8 devfn, u64 gtable);
+int msixtbl_pt_register(struct domain *d, int pirq);
+void msixtbl_pt_unregister(struct domain *d, int pirq);
+
+#endif /* __XEN_PCI_H__ */
diff -Naurp xen/include/xen/pci_regs.h xen-redhat/include/xen/pci_regs.h
--- xen/include/xen/pci_regs.h
+++ xen-redhat/include/xen/pci_regs.h
@@ -0,0 +1,530 @@
+/*
+ *	pci_regs.h
+ *
+ *	PCI standard defines
+ *	Copyright 1994, Drew Eckhardt
+ *	Copyright 1997--1999 Martin Mares <mj@ucw.cz>
+ *
+ *	For more information, please consult the following manuals (look at
+ *	http://www.pcisig.com/ for how to get them):
+ *
+ *	PCI BIOS Specification
+ *	PCI Local Bus Specification
+ *	PCI to PCI Bridge Specification
+ *	PCI System Design Guide
+ *
+ * 	For hypertransport information, please consult the following manuals
+ * 	from http://www.hypertransport.org
+ *
+ *	The Hypertransport I/O Link Specification
+ */
+
+#ifndef LINUX_PCI_REGS_H
+#define LINUX_PCI_REGS_H
+
+/*
+ * Under PCI, each device has 256 bytes of configuration address space,
+ * of which the first 64 bytes are standardized as follows:
+ */
+#define PCI_VENDOR_ID		0x00	/* 16 bits */
+#define PCI_DEVICE_ID		0x02	/* 16 bits */
+#define PCI_COMMAND		0x04	/* 16 bits */
+#define  PCI_COMMAND_IO		0x1	/* Enable response in I/O space */
+#define  PCI_COMMAND_MEMORY	0x2	/* Enable response in Memory space */
+#define  PCI_COMMAND_MASTER	0x4	/* Enable bus mastering */
+#define  PCI_COMMAND_SPECIAL	0x8	/* Enable response to special cycles */
+#define  PCI_COMMAND_INVALIDATE	0x10	/* Use memory write and invalidate */
+#define  PCI_COMMAND_VGA_PALETTE 0x20	/* Enable palette snooping */
+#define  PCI_COMMAND_PARITY	0x40	/* Enable parity checking */
+#define  PCI_COMMAND_WAIT 	0x80	/* Enable address/data stepping */
+#define  PCI_COMMAND_SERR	0x100	/* Enable SERR */
+#define  PCI_COMMAND_FAST_BACK	0x200	/* Enable back-to-back writes */
+#define  PCI_COMMAND_INTX_DISABLE 0x400 /* INTx Emulation Disable */
+
+#define PCI_STATUS		0x06	/* 16 bits */
+#define  PCI_STATUS_CAP_LIST	0x10	/* Support Capability List */
+#define  PCI_STATUS_66MHZ	0x20	/* Support 66 MHz PCI 2.1 bus */
+#define  PCI_STATUS_UDF		0x40	/* Support User Definable Features [obsolete] */
+#define  PCI_STATUS_FAST_BACK	0x80	/* Accept fast back-to-back */
+#define  PCI_STATUS_PARITY	0x100	/* Detected parity error */
+#define  PCI_STATUS_DEVSEL_MASK	0x600	/* DEVSEL timing */
+#define  PCI_STATUS_DEVSEL_FAST		0x000
+#define  PCI_STATUS_DEVSEL_MEDIUM	0x200
+#define  PCI_STATUS_DEVSEL_SLOW		0x400
+#define  PCI_STATUS_SIG_TARGET_ABORT	0x800 /* Set on target abort */
+#define  PCI_STATUS_REC_TARGET_ABORT	0x1000 /* Master ack of target abort */
+#define  PCI_STATUS_REC_MASTER_ABORT	0x2000 /* Set on master abort */
+#define  PCI_STATUS_SIG_SYSTEM_ERROR	0x4000 /* Set when we drive SERR */
+#define  PCI_STATUS_DETECTED_PARITY	0x8000 /* Set on parity error */
+
+#define PCI_CLASS_REVISION	0x08	/* High 24 bits are class, low 8 revision */
+#define PCI_REVISION_ID		0x08	/* Revision ID */
+#define PCI_CLASS_PROG		0x09	/* Reg. Level Programming Interface */
+#define PCI_CLASS_DEVICE	0x0a	/* Device class */
+
+#define PCI_CACHE_LINE_SIZE	0x0c	/* 8 bits */
+#define PCI_LATENCY_TIMER	0x0d	/* 8 bits */
+#define PCI_HEADER_TYPE		0x0e	/* 8 bits */
+#define  PCI_HEADER_TYPE_NORMAL		0
+#define  PCI_HEADER_TYPE_BRIDGE		1
+#define  PCI_HEADER_TYPE_CARDBUS	2
+
+#define PCI_BIST		0x0f	/* 8 bits */
+#define  PCI_BIST_CODE_MASK	0x0f	/* Return result */
+#define  PCI_BIST_START		0x40	/* 1 to start BIST, 2 secs or less */
+#define  PCI_BIST_CAPABLE	0x80	/* 1 if BIST capable */
+
+/*
+ * Base addresses specify locations in memory or I/O space.
+ * Decoded size can be determined by writing a value of
+ * 0xffffffff to the register, and reading it back.  Only
+ * 1 bits are decoded.
+ */
+#define PCI_BASE_ADDRESS_0	0x10	/* 32 bits */
+#define PCI_BASE_ADDRESS_1	0x14	/* 32 bits [htype 0,1 only] */
+#define PCI_BASE_ADDRESS_2	0x18	/* 32 bits [htype 0 only] */
+#define PCI_BASE_ADDRESS_3	0x1c	/* 32 bits */
+#define PCI_BASE_ADDRESS_4	0x20	/* 32 bits */
+#define PCI_BASE_ADDRESS_5	0x24	/* 32 bits */
+#define  PCI_BASE_ADDRESS_SPACE		0x01	/* 0 = memory, 1 = I/O */
+#define  PCI_BASE_ADDRESS_SPACE_IO	0x01
+#define  PCI_BASE_ADDRESS_SPACE_MEMORY	0x00
+#define  PCI_BASE_ADDRESS_MEM_TYPE_MASK	0x06
+#define  PCI_BASE_ADDRESS_MEM_TYPE_32	0x00	/* 32 bit address */
+#define  PCI_BASE_ADDRESS_MEM_TYPE_1M	0x02	/* Below 1M [obsolete] */
+#define  PCI_BASE_ADDRESS_MEM_TYPE_64	0x04	/* 64 bit address */
+#define  PCI_BASE_ADDRESS_MEM_PREFETCH	0x08	/* prefetchable? */
+#define  PCI_BASE_ADDRESS_MEM_MASK	(~0x0fUL)
+#define  PCI_BASE_ADDRESS_IO_MASK	(~0x03UL)
+/* bit 1 is reserved if address_space = 1 */
+
+/* Header type 0 (normal devices) */
+#define PCI_CARDBUS_CIS		0x28
+#define PCI_SUBSYSTEM_VENDOR_ID	0x2c
+#define PCI_SUBSYSTEM_ID	0x2e
+#define PCI_ROM_ADDRESS		0x30	/* Bits 31..11 are address, 10..1 reserved */
+#define  PCI_ROM_ADDRESS_ENABLE	0x01
+#define PCI_ROM_ADDRESS_MASK	(~0x7ffUL)
+
+#define PCI_CAPABILITY_LIST	0x34	/* Offset of first capability list entry */
+
+/* 0x35-0x3b are reserved */
+#define PCI_INTERRUPT_LINE	0x3c	/* 8 bits */
+#define PCI_INTERRUPT_PIN	0x3d	/* 8 bits */
+#define PCI_MIN_GNT		0x3e	/* 8 bits */
+#define PCI_MAX_LAT		0x3f	/* 8 bits */
+
+/* Header type 1 (PCI-to-PCI bridges) */
+#define PCI_PRIMARY_BUS		0x18	/* Primary bus number */
+#define PCI_SECONDARY_BUS	0x19	/* Secondary bus number */
+#define PCI_SUBORDINATE_BUS	0x1a	/* Highest bus number behind the bridge */
+#define PCI_SEC_LATENCY_TIMER	0x1b	/* Latency timer for secondary interface */
+#define PCI_IO_BASE		0x1c	/* I/O range behind the bridge */
+#define PCI_IO_LIMIT		0x1d
+#define  PCI_IO_RANGE_TYPE_MASK	0x0fUL	/* I/O bridging type */
+#define  PCI_IO_RANGE_TYPE_16	0x00
+#define  PCI_IO_RANGE_TYPE_32	0x01
+#define  PCI_IO_RANGE_MASK	(~0x0fUL)
+#define PCI_SEC_STATUS		0x1e	/* Secondary status register, only bit 14 used */
+#define PCI_MEMORY_BASE		0x20	/* Memory range behind */
+#define PCI_MEMORY_LIMIT	0x22
+#define  PCI_MEMORY_RANGE_TYPE_MASK 0x0fUL
+#define  PCI_MEMORY_RANGE_MASK	(~0x0fUL)
+#define PCI_PREF_MEMORY_BASE	0x24	/* Prefetchable memory range behind */
+#define PCI_PREF_MEMORY_LIMIT	0x26
+#define  PCI_PREF_RANGE_TYPE_MASK 0x0fUL
+#define  PCI_PREF_RANGE_TYPE_32	0x00
+#define  PCI_PREF_RANGE_TYPE_64	0x01
+#define  PCI_PREF_RANGE_MASK	(~0x0fUL)
+#define PCI_PREF_BASE_UPPER32	0x28	/* Upper half of prefetchable memory range */
+#define PCI_PREF_LIMIT_UPPER32	0x2c
+#define PCI_IO_BASE_UPPER16	0x30	/* Upper half of I/O addresses */
+#define PCI_IO_LIMIT_UPPER16	0x32
+/* 0x34 same as for htype 0 */
+/* 0x35-0x3b is reserved */
+#define PCI_ROM_ADDRESS1	0x38	/* Same as PCI_ROM_ADDRESS, but for htype 1 */
+/* 0x3c-0x3d are same as for htype 0 */
+#define PCI_BRIDGE_CONTROL	0x3e
+#define  PCI_BRIDGE_CTL_PARITY	0x01	/* Enable parity detection on secondary interface */
+#define  PCI_BRIDGE_CTL_SERR	0x02	/* The same for SERR forwarding */
+#define  PCI_BRIDGE_CTL_ISA	0x04	/* Enable ISA mode */
+#define  PCI_BRIDGE_CTL_VGA	0x08	/* Forward VGA addresses */
+#define  PCI_BRIDGE_CTL_MASTER_ABORT	0x20  /* Report master aborts */
+#define  PCI_BRIDGE_CTL_BUS_RESET	0x40	/* Secondary bus reset */
+#define  PCI_BRIDGE_CTL_FAST_BACK	0x80	/* Fast Back2Back enabled on secondary interface */
+
+/* Header type 2 (CardBus bridges) */
+#define PCI_CB_CAPABILITY_LIST	0x14
+/* 0x15 reserved */
+#define PCI_CB_SEC_STATUS	0x16	/* Secondary status */
+#define PCI_CB_PRIMARY_BUS	0x18	/* PCI bus number */
+#define PCI_CB_CARD_BUS		0x19	/* CardBus bus number */
+#define PCI_CB_SUBORDINATE_BUS	0x1a	/* Subordinate bus number */
+#define PCI_CB_LATENCY_TIMER	0x1b	/* CardBus latency timer */
+#define PCI_CB_MEMORY_BASE_0	0x1c
+#define PCI_CB_MEMORY_LIMIT_0	0x20
+#define PCI_CB_MEMORY_BASE_1	0x24
+#define PCI_CB_MEMORY_LIMIT_1	0x28
+#define PCI_CB_IO_BASE_0	0x2c
+#define PCI_CB_IO_BASE_0_HI	0x2e
+#define PCI_CB_IO_LIMIT_0	0x30
+#define PCI_CB_IO_LIMIT_0_HI	0x32
+#define PCI_CB_IO_BASE_1	0x34
+#define PCI_CB_IO_BASE_1_HI	0x36
+#define PCI_CB_IO_LIMIT_1	0x38
+#define PCI_CB_IO_LIMIT_1_HI	0x3a
+#define  PCI_CB_IO_RANGE_MASK	(~0x03UL)
+/* 0x3c-0x3d are same as for htype 0 */
+#define PCI_CB_BRIDGE_CONTROL	0x3e
+#define  PCI_CB_BRIDGE_CTL_PARITY	0x01	/* Similar to standard bridge control register */
+#define  PCI_CB_BRIDGE_CTL_SERR		0x02
+#define  PCI_CB_BRIDGE_CTL_ISA		0x04
+#define  PCI_CB_BRIDGE_CTL_VGA		0x08
+#define  PCI_CB_BRIDGE_CTL_MASTER_ABORT	0x20
+#define  PCI_CB_BRIDGE_CTL_CB_RESET	0x40	/* CardBus reset */
+#define  PCI_CB_BRIDGE_CTL_16BIT_INT	0x80	/* Enable interrupt for 16-bit cards */
+#define  PCI_CB_BRIDGE_CTL_PREFETCH_MEM0 0x100	/* Prefetch enable for both memory regions */
+#define  PCI_CB_BRIDGE_CTL_PREFETCH_MEM1 0x200
+#define  PCI_CB_BRIDGE_CTL_POST_WRITES	0x400
+#define PCI_CB_SUBSYSTEM_VENDOR_ID	0x40
+#define PCI_CB_SUBSYSTEM_ID		0x42
+#define PCI_CB_LEGACY_MODE_BASE		0x44	/* 16-bit PC Card legacy mode base address (ExCa) */
+/* 0x48-0x7f reserved */
+
+/* Capability lists */
+
+#define PCI_CAP_LIST_ID		0	/* Capability ID */
+#define  PCI_CAP_ID_PM		0x01	/* Power Management */
+#define  PCI_CAP_ID_AGP		0x02	/* Accelerated Graphics Port */
+#define  PCI_CAP_ID_VPD		0x03	/* Vital Product Data */
+#define  PCI_CAP_ID_SLOTID	0x04	/* Slot Identification */
+#define  PCI_CAP_ID_MSI		0x05	/* Message Signalled Interrupts */
+#define  PCI_CAP_ID_CHSWP	0x06	/* CompactPCI HotSwap */
+#define  PCI_CAP_ID_PCIX	0x07	/* PCI-X */
+#define  PCI_CAP_ID_HT		0x08	/* HyperTransport */
+#define  PCI_CAP_ID_VNDR	0x09	/* Vendor specific */
+#define  PCI_CAP_ID_DBG		0x0A	/* Debug port */
+#define  PCI_CAP_ID_CCRC	0x0B	/* CompactPCI Central Resource Control */
+#define  PCI_CAP_ID_SHPC 	0x0C	/* PCI Standard Hot-Plug Controller */
+#define  PCI_CAP_ID_SSVID	0x0D	/* Bridge subsystem vendor/device ID */
+#define  PCI_CAP_ID_AGP3	0x0E	/* AGP Target PCI-PCI bridge */
+#define  PCI_CAP_ID_EXP 	0x10	/* PCI Express */
+#define  PCI_CAP_ID_MSIX	0x11	/* MSI-X */
+#define PCI_CAP_LIST_NEXT	1	/* Next capability in the list */
+#define PCI_CAP_FLAGS		2	/* Capability defined flags (16 bits) */
+#define PCI_CAP_SIZEOF		4
+
+/* Power Management Registers */
+
+#define PCI_PM_PMC		2	/* PM Capabilities Register */
+#define  PCI_PM_CAP_VER_MASK	0x0007	/* Version */
+#define  PCI_PM_CAP_PME_CLOCK	0x0008	/* PME clock required */
+#define  PCI_PM_CAP_RESERVED    0x0010  /* Reserved field */
+#define  PCI_PM_CAP_DSI		0x0020	/* Device specific initialization */
+#define  PCI_PM_CAP_AUX_POWER	0x01C0	/* Auxiliary power support mask */
+#define  PCI_PM_CAP_D1		0x0200	/* D1 power state support */
+#define  PCI_PM_CAP_D2		0x0400	/* D2 power state support */
+#define  PCI_PM_CAP_PME		0x0800	/* PME pin supported */
+#define  PCI_PM_CAP_PME_MASK	0xF800	/* PME Mask of all supported states */
+#define  PCI_PM_CAP_PME_D0	0x0800	/* PME# from D0 */
+#define  PCI_PM_CAP_PME_D1	0x1000	/* PME# from D1 */
+#define  PCI_PM_CAP_PME_D2	0x2000	/* PME# from D2 */
+#define  PCI_PM_CAP_PME_D3	0x4000	/* PME# from D3 (hot) */
+#define  PCI_PM_CAP_PME_D3cold	0x8000	/* PME# from D3 (cold) */
+#define PCI_PM_CTRL		4	/* PM control and status register */
+#define  PCI_PM_CTRL_STATE_MASK	0x0003	/* Current power state (D0 to D3) */
+#define  PCI_PM_CTRL_NO_SOFT_RESET	0x0008	/* No reset for D3hot->D0 */
+#define  PCI_PM_CTRL_PME_ENABLE	0x0100	/* PME pin enable */
+#define  PCI_PM_CTRL_DATA_SEL_MASK	0x1e00	/* Data select (??) */
+#define  PCI_PM_CTRL_DATA_SCALE_MASK	0x6000	/* Data scale (??) */
+#define  PCI_PM_CTRL_PME_STATUS	0x8000	/* PME pin status */
+#define PCI_PM_PPB_EXTENSIONS	6	/* PPB support extensions (??) */
+#define  PCI_PM_PPB_B2_B3	0x40	/* Stop clock when in D3hot (??) */
+#define  PCI_PM_BPCC_ENABLE	0x80	/* Bus power/clock control enable (??) */
+#define PCI_PM_DATA_REGISTER	7	/* (??) */
+#define PCI_PM_SIZEOF		8
+
+/* AGP registers */
+
+#define PCI_AGP_VERSION		2	/* BCD version number */
+#define PCI_AGP_RFU		3	/* Rest of capability flags */
+#define PCI_AGP_STATUS		4	/* Status register */
+#define  PCI_AGP_STATUS_RQ_MASK	0xff000000	/* Maximum number of requests - 1 */
+#define  PCI_AGP_STATUS_SBA	0x0200	/* Sideband addressing supported */
+#define  PCI_AGP_STATUS_64BIT	0x0020	/* 64-bit addressing supported */
+#define  PCI_AGP_STATUS_FW	0x0010	/* FW transfers supported */
+#define  PCI_AGP_STATUS_RATE4	0x0004	/* 4x transfer rate supported */
+#define  PCI_AGP_STATUS_RATE2	0x0002	/* 2x transfer rate supported */
+#define  PCI_AGP_STATUS_RATE1	0x0001	/* 1x transfer rate supported */
+#define PCI_AGP_COMMAND		8	/* Control register */
+#define  PCI_AGP_COMMAND_RQ_MASK 0xff000000  /* Master: Maximum number of requests */
+#define  PCI_AGP_COMMAND_SBA	0x0200	/* Sideband addressing enabled */
+#define  PCI_AGP_COMMAND_AGP	0x0100	/* Allow processing of AGP transactions */
+#define  PCI_AGP_COMMAND_64BIT	0x0020 	/* Allow processing of 64-bit addresses */
+#define  PCI_AGP_COMMAND_FW	0x0010 	/* Force FW transfers */
+#define  PCI_AGP_COMMAND_RATE4	0x0004	/* Use 4x rate */
+#define  PCI_AGP_COMMAND_RATE2	0x0002	/* Use 2x rate */
+#define  PCI_AGP_COMMAND_RATE1	0x0001	/* Use 1x rate */
+#define PCI_AGP_SIZEOF		12
+
+/* Vital Product Data */
+
+#define PCI_VPD_ADDR		2	/* Address to access (15 bits!) */
+#define  PCI_VPD_ADDR_MASK	0x7fff	/* Address mask */
+#define  PCI_VPD_ADDR_F		0x8000	/* Write 0, 1 indicates completion */
+#define PCI_VPD_DATA		4	/* 32-bits of data returned here */
+
+/* Slot Identification */
+
+#define PCI_SID_ESR		2	/* Expansion Slot Register */
+#define  PCI_SID_ESR_NSLOTS	0x1f	/* Number of expansion slots available */
+#define  PCI_SID_ESR_FIC	0x20	/* First In Chassis Flag */
+#define PCI_SID_CHASSIS_NR	3	/* Chassis Number */
+
+/* Message Signalled Interrupts registers */
+
+#define PCI_MSI_FLAGS		2	/* Various flags */
+#define  PCI_MSI_FLAGS_64BIT	0x80	/* 64-bit addresses allowed */
+#define  PCI_MSI_FLAGS_QSIZE	0x70	/* Message queue size configured */
+#define  PCI_MSI_FLAGS_QMASK	0x0e	/* Maximum queue size available */
+#define  PCI_MSI_FLAGS_ENABLE	0x01	/* MSI feature enabled */
+#define  PCI_MSI_FLAGS_MASKBIT	0x100	/* 64-bit mask bits allowed */
+#define PCI_MSI_RFU		3	/* Rest of capability flags */
+#define PCI_MSI_ADDRESS_LO	4	/* Lower 32 bits */
+#define PCI_MSI_ADDRESS_HI	8	/* Upper 32 bits (if PCI_MSI_FLAGS_64BIT set) */
+#define PCI_MSI_DATA_32		8	/* 16 bits of data for 32-bit devices */
+#define PCI_MSI_DATA_64		12	/* 16 bits of data for 64-bit devices */
+#define PCI_MSI_MASK_BIT	16	/* Mask bits register */
+
+/* MSI-X registers (these are at offset PCI_MSIX_FLAGS) */
+#define PCI_MSIX_FLAGS		2
+#define  PCI_MSIX_FLAGS_QSIZE	0x7FF
+#define  PCI_MSIX_FLAGS_ENABLE	(1 << 15)
+#define  PCI_MSIX_FLAGS_MASKALL	(1 << 14)
+#define PCI_MSIX_FLAGS_BIRMASK	(7 << 0)
+#define PCI_MSIX_FLAGS_BITMASK	(1 << 0)
+
+/* CompactPCI Hotswap Register */
+
+#define PCI_CHSWP_CSR		2	/* Control and Status Register */
+#define  PCI_CHSWP_DHA		0x01	/* Device Hiding Arm */
+#define  PCI_CHSWP_EIM		0x02	/* ENUM# Signal Mask */
+#define  PCI_CHSWP_PIE		0x04	/* Pending Insert or Extract */
+#define  PCI_CHSWP_LOO		0x08	/* LED On / Off */
+#define  PCI_CHSWP_PI		0x30	/* Programming Interface */
+#define  PCI_CHSWP_EXT		0x40	/* ENUM# status - extraction */
+#define  PCI_CHSWP_INS		0x80	/* ENUM# status - insertion */
+
+/* PCI-X registers */
+
+#define PCI_X_CMD		2	/* Modes & Features */
+#define  PCI_X_CMD_DPERR_E	0x0001	/* Data Parity Error Recovery Enable */
+#define  PCI_X_CMD_ERO		0x0002	/* Enable Relaxed Ordering */
+#define  PCI_X_CMD_READ_512	0x0000	/* 512 byte maximum read byte count */
+#define  PCI_X_CMD_READ_1K	0x0004	/* 1Kbyte maximum read byte count */
+#define  PCI_X_CMD_READ_2K	0x0008	/* 2Kbyte maximum read byte count */
+#define  PCI_X_CMD_READ_4K	0x000c	/* 4Kbyte maximum read byte count */
+#define  PCI_X_CMD_MAX_READ	0x000c	/* Max Memory Read Byte Count */
+				/* Max # of outstanding split transactions */
+#define  PCI_X_CMD_SPLIT_1	0x0000	/* Max 1 */
+#define  PCI_X_CMD_SPLIT_2	0x0010	/* Max 2 */
+#define  PCI_X_CMD_SPLIT_3	0x0020	/* Max 3 */
+#define  PCI_X_CMD_SPLIT_4	0x0030	/* Max 4 */
+#define  PCI_X_CMD_SPLIT_8	0x0040	/* Max 8 */
+#define  PCI_X_CMD_SPLIT_12	0x0050	/* Max 12 */
+#define  PCI_X_CMD_SPLIT_16	0x0060	/* Max 16 */
+#define  PCI_X_CMD_SPLIT_32	0x0070	/* Max 32 */
+#define  PCI_X_CMD_MAX_SPLIT	0x0070	/* Max Outstanding Split Transactions */
+#define  PCI_X_CMD_VERSION(x) 	(((x) >> 12) & 3) /* Version */
+#define PCI_X_STATUS		4	/* PCI-X capabilities */
+#define  PCI_X_STATUS_DEVFN	0x000000ff	/* A copy of devfn */
+#define  PCI_X_STATUS_BUS	0x0000ff00	/* A copy of bus nr */
+#define  PCI_X_STATUS_64BIT	0x00010000	/* 64-bit device */
+#define  PCI_X_STATUS_133MHZ	0x00020000	/* 133 MHz capable */
+#define  PCI_X_STATUS_SPL_DISC	0x00040000	/* Split Completion Discarded */
+#define  PCI_X_STATUS_UNX_SPL	0x00080000	/* Unexpected Split Completion */
+#define  PCI_X_STATUS_COMPLEX	0x00100000	/* Device Complexity */
+#define  PCI_X_STATUS_MAX_READ	0x00600000	/* Designed Max Memory Read Count */
+#define  PCI_X_STATUS_MAX_SPLIT	0x03800000	/* Designed Max Outstanding Split Transactions */
+#define  PCI_X_STATUS_MAX_CUM	0x1c000000	/* Designed Max Cumulative Read Size */
+#define  PCI_X_STATUS_SPL_ERR	0x20000000	/* Rcvd Split Completion Error Msg */
+#define  PCI_X_STATUS_266MHZ	0x40000000	/* 266 MHz capable */
+#define  PCI_X_STATUS_533MHZ	0x80000000	/* 533 MHz capable */
+
+/* PCI Express capability registers */
+
+#define PCI_EXP_FLAGS		2	/* Capabilities register */
+#define PCI_EXP_FLAGS_VERS	0x000f	/* Capability version */
+#define PCI_EXP_FLAGS_TYPE	0x00f0	/* Device/Port type */
+#define  PCI_EXP_TYPE_ENDPOINT	0x0	/* Express Endpoint */
+#define  PCI_EXP_TYPE_LEG_END	0x1	/* Legacy Endpoint */
+#define  PCI_EXP_TYPE_ROOT_PORT 0x4	/* Root Port */
+#define  PCI_EXP_TYPE_UPSTREAM	0x5	/* Upstream Port */
+#define  PCI_EXP_TYPE_DOWNSTREAM 0x6	/* Downstream Port */
+#define  PCI_EXP_TYPE_PCI_BRIDGE 0x7	/* PCI/PCI-X Bridge */
+#define PCI_EXP_FLAGS_SLOT	0x0100	/* Slot implemented */
+#define PCI_EXP_FLAGS_IRQ	0x3e00	/* Interrupt message number */
+#define PCI_EXP_DEVCAP		4	/* Device capabilities */
+#define  PCI_EXP_DEVCAP_PAYLOAD	0x07	/* Max_Payload_Size */
+#define  PCI_EXP_DEVCAP_PHANTOM	0x18	/* Phantom functions */
+#define  PCI_EXP_DEVCAP_EXT_TAG	0x20	/* Extended tags */
+#define  PCI_EXP_DEVCAP_L0S	0x1c0	/* L0s Acceptable Latency */
+#define  PCI_EXP_DEVCAP_L1	0xe00	/* L1 Acceptable Latency */
+#define  PCI_EXP_DEVCAP_ATN_BUT	0x1000	/* Attention Button Present */
+#define  PCI_EXP_DEVCAP_ATN_IND	0x2000	/* Attention Indicator Present */
+#define  PCI_EXP_DEVCAP_PWR_IND	0x4000	/* Power Indicator Present */
+#define  PCI_EXP_DEVCAP_PWR_VAL	0x3fc0000 /* Slot Power Limit Value */
+#define  PCI_EXP_DEVCAP_PWR_SCL	0xc000000 /* Slot Power Limit Scale */
+#define PCI_EXP_DEVCTL		8	/* Device Control */
+#define  PCI_EXP_DEVCTL_CERE	0x0001	/* Correctable Error Reporting En. */
+#define  PCI_EXP_DEVCTL_NFERE	0x0002	/* Non-Fatal Error Reporting Enable */
+#define  PCI_EXP_DEVCTL_FERE	0x0004	/* Fatal Error Reporting Enable */
+#define  PCI_EXP_DEVCTL_URRE	0x0008	/* Unsupported Request Reporting En. */
+#define  PCI_EXP_DEVCTL_RELAX_EN 0x0010 /* Enable relaxed ordering */
+#define  PCI_EXP_DEVCTL_PAYLOAD	0x00e0	/* Max_Payload_Size */
+#define  PCI_EXP_DEVCTL_EXT_TAG	0x0100	/* Extended Tag Field Enable */
+#define  PCI_EXP_DEVCTL_PHANTOM	0x0200	/* Phantom Functions Enable */
+#define  PCI_EXP_DEVCTL_AUX_PME	0x0400	/* Auxiliary Power PM Enable */
+#define  PCI_EXP_DEVCTL_NOSNOOP_EN 0x0800  /* Enable No Snoop */
+#define  PCI_EXP_DEVCTL_READRQ	0x7000	/* Max_Read_Request_Size */
+#define PCI_EXP_DEVSTA		10	/* Device Status */
+#define  PCI_EXP_DEVSTA_CED	0x01	/* Correctable Error Detected */
+#define  PCI_EXP_DEVSTA_NFED	0x02	/* Non-Fatal Error Detected */
+#define  PCI_EXP_DEVSTA_FED	0x04	/* Fatal Error Detected */
+#define  PCI_EXP_DEVSTA_URD	0x08	/* Unsupported Request Detected */
+#define  PCI_EXP_DEVSTA_AUXPD	0x10	/* AUX Power Detected */
+#define  PCI_EXP_DEVSTA_TRPND	0x20	/* Transactions Pending */
+#define PCI_EXP_LNKCAP		12	/* Link Capabilities */
+#define PCI_EXP_LNKCTL		16	/* Link Control */
+#define  PCI_EXP_LNKCTL_CLKREQ_EN 0x100	/* Enable clkreq */
+#define PCI_EXP_LNKSTA		18	/* Link Status */
+#define PCI_EXP_SLTCAP		20	/* Slot Capabilities */
+#define PCI_EXP_SLTCTL		24	/* Slot Control */
+#define PCI_EXP_SLTSTA		26	/* Slot Status */
+#define PCI_EXP_RTCTL		28	/* Root Control */
+#define  PCI_EXP_RTCTL_SECEE	0x01	/* System Error on Correctable Error */
+#define  PCI_EXP_RTCTL_SENFEE	0x02	/* System Error on Non-Fatal Error */
+#define  PCI_EXP_RTCTL_SEFEE	0x04	/* System Error on Fatal Error */
+#define  PCI_EXP_RTCTL_PMEIE	0x08	/* PME Interrupt Enable */
+#define  PCI_EXP_RTCTL_CRSSVE	0x10	/* CRS Software Visibility Enable */
+#define PCI_EXP_RTCAP		30	/* Root Capabilities */
+#define PCI_EXP_RTSTA		32	/* Root Status */
+
+/* Extended Capabilities (PCI-X 2.0 and Express) */
+#define PCI_EXT_CAP_ID(header)		(header & 0x0000ffff)
+#define PCI_EXT_CAP_VER(header)		((header >> 16) & 0xf)
+#define PCI_EXT_CAP_NEXT(header)	((header >> 20) & 0xffc)
+
+#define PCI_EXT_CAP_ID_ERR	1
+#define PCI_EXT_CAP_ID_VC	2
+#define PCI_EXT_CAP_ID_DSN	3
+#define PCI_EXT_CAP_ID_PWR	4
+
+/* Advanced Error Reporting */
+#define PCI_ERR_UNCOR_STATUS	4	/* Uncorrectable Error Status */
+#define  PCI_ERR_UNC_TRAIN	0x00000001	/* Training */
+#define  PCI_ERR_UNC_DLP	0x00000010	/* Data Link Protocol */
+#define  PCI_ERR_UNC_POISON_TLP	0x00001000	/* Poisoned TLP */
+#define  PCI_ERR_UNC_FCP	0x00002000	/* Flow Control Protocol */
+#define  PCI_ERR_UNC_COMP_TIME	0x00004000	/* Completion Timeout */
+#define  PCI_ERR_UNC_COMP_ABORT	0x00008000	/* Completer Abort */
+#define  PCI_ERR_UNC_UNX_COMP	0x00010000	/* Unexpected Completion */
+#define  PCI_ERR_UNC_RX_OVER	0x00020000	/* Receiver Overflow */
+#define  PCI_ERR_UNC_MALF_TLP	0x00040000	/* Malformed TLP */
+#define  PCI_ERR_UNC_ECRC	0x00080000	/* ECRC Error Status */
+#define  PCI_ERR_UNC_UNSUP	0x00100000	/* Unsupported Request */
+#define PCI_ERR_UNCOR_MASK	8	/* Uncorrectable Error Mask */
+	/* Same bits as above */
+#define PCI_ERR_UNCOR_SEVER	12	/* Uncorrectable Error Severity */
+	/* Same bits as above */
+#define PCI_ERR_COR_STATUS	16	/* Correctable Error Status */
+#define  PCI_ERR_COR_RCVR	0x00000001	/* Receiver Error Status */
+#define  PCI_ERR_COR_BAD_TLP	0x00000040	/* Bad TLP Status */
+#define  PCI_ERR_COR_BAD_DLLP	0x00000080	/* Bad DLLP Status */
+#define  PCI_ERR_COR_REP_ROLL	0x00000100	/* REPLAY_NUM Rollover */
+#define  PCI_ERR_COR_REP_TIMER	0x00001000	/* Replay Timer Timeout */
+#define PCI_ERR_COR_MASK	20	/* Correctable Error Mask */
+	/* Same bits as above */
+#define PCI_ERR_CAP		24	/* Advanced Error Capabilities */
+#define  PCI_ERR_CAP_FEP(x)	((x) & 31)	/* First Error Pointer */
+#define  PCI_ERR_CAP_ECRC_GENC	0x00000020	/* ECRC Generation Capable */
+#define  PCI_ERR_CAP_ECRC_GENE	0x00000040	/* ECRC Generation Enable */
+#define  PCI_ERR_CAP_ECRC_CHKC	0x00000080	/* ECRC Check Capable */
+#define  PCI_ERR_CAP_ECRC_CHKE	0x00000100	/* ECRC Check Enable */
+#define PCI_ERR_HEADER_LOG	28	/* Header Log Register (16 bytes) */
+#define PCI_ERR_ROOT_COMMAND	44	/* Root Error Command */
+/* Correctable Err Reporting Enable */
+#define PCI_ERR_ROOT_CMD_COR_EN		0x00000001
+/* Non-fatal Err Reporting Enable */
+#define PCI_ERR_ROOT_CMD_NONFATAL_EN	0x00000002
+/* Fatal Err Reporting Enable */
+#define PCI_ERR_ROOT_CMD_FATAL_EN	0x00000004
+#define PCI_ERR_ROOT_STATUS	48
+#define PCI_ERR_ROOT_COR_RCV		0x00000001	/* ERR_COR Received */
+/* Multi ERR_COR Received */
+#define PCI_ERR_ROOT_MULTI_COR_RCV	0x00000002
+/* ERR_FATAL/NONFATAL Received */
+#define PCI_ERR_ROOT_UNCOR_RCV		0x00000004
+/* Multi ERR_FATAL/NONFATAL Received */
+#define PCI_ERR_ROOT_MULTI_UNCOR_RCV	0x00000008
+#define PCI_ERR_ROOT_FIRST_FATAL	0x00000010	/* First Fatal */
+#define PCI_ERR_ROOT_NONFATAL_RCV	0x00000020	/* Non-Fatal Received */
+#define PCI_ERR_ROOT_FATAL_RCV		0x00000040	/* Fatal Received */
+#define PCI_ERR_ROOT_COR_SRC	52
+#define PCI_ERR_ROOT_SRC	54
+
+/* Virtual Channel */
+#define PCI_VC_PORT_REG1	4
+#define PCI_VC_PORT_REG2	8
+#define PCI_VC_PORT_CTRL	12
+#define PCI_VC_PORT_STATUS	14
+#define PCI_VC_RES_CAP		16
+#define PCI_VC_RES_CTRL		20
+#define PCI_VC_RES_STATUS	26
+
+/* Power Budgeting */
+#define PCI_PWR_DSR		4	/* Data Select Register */
+#define PCI_PWR_DATA		8	/* Data Register */
+#define  PCI_PWR_DATA_BASE(x)	((x) & 0xff)	    /* Base Power */
+#define  PCI_PWR_DATA_SCALE(x)	(((x) >> 8) & 3)    /* Data Scale */
+#define  PCI_PWR_DATA_PM_SUB(x)	(((x) >> 10) & 7)   /* PM Sub State */
+#define  PCI_PWR_DATA_PM_STATE(x) (((x) >> 13) & 3) /* PM State */
+#define  PCI_PWR_DATA_TYPE(x)	(((x) >> 15) & 7)   /* Type */
+#define  PCI_PWR_DATA_RAIL(x)	(((x) >> 18) & 7)   /* Power Rail */
+#define PCI_PWR_CAP		12	/* Capability */
+#define  PCI_PWR_CAP_BUDGET(x)	((x) & 1)	/* Included in system budget */
+
+/*
+ * Hypertransport sub capability types
+ *
+ * Unfortunately there are both 3 bit and 5 bit capability types defined
+ * in the HT spec, catering for that is a little messy. You probably don't
+ * want to use these directly, just use pci_find_ht_capability() and it
+ * will do the right thing for you.
+ */
+#define HT_3BIT_CAP_MASK	0xE0
+#define HT_CAPTYPE_SLAVE	0x00	/* Slave/Primary link configuration */
+#define HT_CAPTYPE_HOST		0x20	/* Host/Secondary link configuration */
+
+#define HT_5BIT_CAP_MASK	0xF8
+#define HT_CAPTYPE_IRQ		0x80	/* IRQ Configuration */
+#define HT_CAPTYPE_REMAPPING_40	0xA0	/* 40 bit address remapping */
+#define HT_CAPTYPE_REMAPPING_64 0xA2	/* 64 bit address remapping */
+#define HT_CAPTYPE_UNITID_CLUMP	0x90	/* Unit ID clumping */
+#define HT_CAPTYPE_EXTCONF	0x98	/* Extended Configuration Space Access */
+#define HT_CAPTYPE_MSI_MAPPING	0xA8	/* MSI Mapping Capability */
+#define  HT_MSI_FLAGS		0x02		/* Offset to flags */
+#define  HT_MSI_FLAGS_ENABLE	0x1		/* Mapping enable */
+#define  HT_MSI_FLAGS_FIXED	0x2		/* Fixed mapping only */
+#define  HT_MSI_FIXED_ADDR	0x00000000FEE00000ULL	/* Fixed addr */
+#define  HT_MSI_ADDR_LO		0x04		/* Offset to low addr bits */
+#define  HT_MSI_ADDR_LO_MASK	0xFFF00000	/* Low address bit mask */
+#define  HT_MSI_ADDR_HI		0x08		/* Offset to high addr bits */
+#define HT_CAPTYPE_DIRECT_ROUTE	0xB0	/* Direct routing configuration */
+#define HT_CAPTYPE_VCSET	0xB8	/* Virtual Channel configuration */
+#define HT_CAPTYPE_ERROR_RETRY	0xC0	/* Retry on error configuration */
+#define HT_CAPTYPE_GEN3		0xD0	/* Generation 3 hypertransport configuration */
+#define HT_CAPTYPE_PM		0xE0	/* Hypertransport power management configuration */
+
+
+#endif /* LINUX_PCI_REGS_H */
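
pci_regs.h provides the raw register offsets that pci.h's
pci_find_cap_offset()/pci_find_next_cap() are expected to use when walking a
device's capability list (PCI_STATUS_CAP_LIST, PCI_CAPABILITY_LIST,
PCI_CAP_LIST_ID, PCI_CAP_LIST_NEXT).  The sketch below is a generic capability
walk over an in-memory config-space image, not Xen's implementation;
cfg_read8()/cfg_read16() are hypothetical stand-ins for accessors such as
pci_conf_read8()/pci_conf_read16().

#include <stdint.h>
#include <stdio.h>

#define PCI_STATUS            0x06
#define  PCI_STATUS_CAP_LIST  0x10
#define PCI_CAPABILITY_LIST   0x34
#define PCI_CAP_LIST_ID       0
#define PCI_CAP_LIST_NEXT     1
#define PCI_CAP_ID_MSI        0x05

/* Hypothetical stand-in for config-space accessors: a 256-byte image
 * (zero-initialized because it has static storage duration). */
static uint8_t cfg[256];
static uint8_t  cfg_read8(unsigned int reg)  { return cfg[reg]; }
static uint16_t cfg_read16(unsigned int reg)
{
    return (uint16_t)(cfg[reg] | (cfg[reg + 1] << 8));
}

/* Return the config-space offset of capability 'cap', or 0 if absent. */
static int find_cap_offset(uint8_t cap)
{
    int ttl = 48;               /* bound the walk against malformed lists */
    unsigned int pos;

    if (!(cfg_read16(PCI_STATUS) & PCI_STATUS_CAP_LIST))
        return 0;

    pos = cfg_read8(PCI_CAPABILITY_LIST) & ~3u;
    while (pos >= 0x40 && ttl--) {
        if (cfg_read8(pos + PCI_CAP_LIST_ID) == cap)
            return pos;
        pos = cfg_read8(pos + PCI_CAP_LIST_NEXT) & ~3u;
    }
    return 0;
}

int main(void)
{
    /* Fake device: status advertises a list, MSI capability at 0x50. */
    cfg[PCI_STATUS]               = PCI_STATUS_CAP_LIST;
    cfg[PCI_CAPABILITY_LIST]      = 0x50;
    cfg[0x50 + PCI_CAP_LIST_ID]   = PCI_CAP_ID_MSI;
    cfg[0x50 + PCI_CAP_LIST_NEXT] = 0x00;

    printf("MSI capability at offset 0x%02x\n",
           find_cap_offset(PCI_CAP_ID_MSI));
    return 0;
}
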
diff -Naurp xen/include/xen/sched.h xen-redhat/include/xen/sched.h
--- xen/include/xen/sched.h
+++ xen-redhat/include/xen/sched.h
@@ -31,12 +31,11 @@ extern unsigned long volatile jiffies;
 extern struct domain *dom0;
 
 #ifndef CONFIG_COMPAT
-#define MAX_EVTCHNS(d)     NR_EVENT_CHANNELS
+#define BITS_PER_EVTCHN_WORD(d) BITS_PER_LONG
 #else
-#define MAX_EVTCHNS(d)     (!IS_COMPAT(d) ? \
-                            NR_EVENT_CHANNELS : \
-                            sizeof(unsigned int) * sizeof(unsigned int) * 64)
+#define BITS_PER_EVTCHN_WORD(d) (has_32bit_shinfo(d) ? 32 : BITS_PER_LONG)
 #endif
+#define MAX_EVTCHNS(d) (BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d))
 #define EVTCHNS_PER_BUCKET 128
 #define NR_EVTCHN_BUCKETS  (NR_EVENT_CHANNELS / EVTCHNS_PER_BUCKET)
 
@@ -119,6 +118,8 @@ struct vcpu 
     bool_t           defer_shutdown;
     /* VCPU is paused following shutdown request (d->is_shutting_down)? */
     bool_t           paused_for_shutdown;
+    /* VCPU affinity is temporarily locked from controller changes? */
+    bool_t           affinity_locked;
 
     unsigned long    pause_flags;
     atomic_t         pause_count;
@@ -166,7 +167,7 @@ struct domain
 
     /* Event channel information. */
     struct evtchn   *evtchn[NR_EVTCHN_BUCKETS];
-    spinlock_t       evtchn_lock;
+    spinlock_t       event_lock;
 
     struct grant_table *grant_table;
 
@@ -191,9 +192,11 @@ struct domain
     /* Are any VCPUs polling event channels (SCHEDOP_poll)? */
     bool_t           is_polling;
     /* Is this guest dying (i.e., a zombie)? */
-    bool_t           is_dying;
+    enum { DOMDYING_alive, DOMDYING_dying, DOMDYING_dead } is_dying;
     /* Domain is paused by controller software? */
     bool_t           is_paused_by_controller;
+    /* Domain's VCPUs are pinned 1:1 to physical CPUs? */
+    bool_t           is_pinned;
 
     /* Guest has shut down (inc. reason code)? */
     spinlock_t       shutdown_lock;
@@ -224,6 +227,9 @@ struct domain
     int32_t time_offset_seconds;
 
     struct rcu_head rcu;
+    /* HV */
+    atomic_t hard_virt;
+
 };
 
 struct domain_setup_info
@@ -335,7 +341,7 @@ static inline struct domain *rcu_lock_cu
 
 struct domain *get_domain_by_id(domid_t dom);
 void domain_destroy(struct domain *d);
-void domain_kill(struct domain *d);
+int domain_kill(struct domain *d);
 void domain_shutdown(struct domain *d, u8 reason);
 void domain_resume(struct domain *d);
 void domain_pause_for_debugger(void);
@@ -476,6 +482,8 @@ void cpu_init(void);
 
 void vcpu_force_reschedule(struct vcpu *v);
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
+int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity);
+void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity);
 
 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
 
@@ -487,15 +495,15 @@ static inline void vcpu_unblock(struct v
 
 #define IS_PRIV(_d) ((_d)->is_privileged)
 
-#ifndef IS_COMPAT
-#define IS_COMPAT(d) 0
-#endif
-
 #define VM_ASSIST(_d,_t) (test_bit((_t), &(_d)->vm_assist))
 
 #define is_hvm_domain(d) ((d)->is_hvm)
 #define is_hvm_vcpu(v)   (is_hvm_domain(v->domain))
 
+extern enum cpufreq_controller {
+    FREQCTL_none, FREQCTL_dom0_kernel
+} cpufreq_controller;
+
 #endif /* __SCHED_H__ */
 
 /*
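
The reworked MAX_EVTCHNS() above is the square of the guest's event-word
width, which matches the two-level event-channel selector scheme: one word of
selector bits, each bit covering one word of pending bits.  A quick standalone
check of the arithmetic (not hypervisor code):

#include <stdio.h>

/* Mirrors MAX_EVTCHNS(d) = BITS_PER_EVTCHN_WORD(d)^2 from the patch. */
static unsigned int max_evtchns(unsigned int bits_per_evtchn_word)
{
    return bits_per_evtchn_word * bits_per_evtchn_word;
}

int main(void)
{
    printf("64-bit event words: %u channels\n", max_evtchns(64));  /* 4096 */
    printf("32-bit event words: %u channels\n", max_evtchns(32));  /* 1024 */
    return 0;
}
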
diff -Naurp xen/include/xen/serial.h xen-redhat/include/xen/serial.h
--- xen/include/xen/serial.h
+++ xen-redhat/include/xen/serial.h
@@ -3,7 +3,7 @@
  * 
  * Framework for serial device drivers.
  * 
- * Copyright (c) 2003-2005, K A Fraser
+ * Copyright (c) 2003-2008, K A Fraser
  */
 
 #ifndef __XEN_SERIAL_H__
@@ -34,6 +34,7 @@ struct serial_port {
     /* Transmit data buffer (interrupt-driven uart). */
     char               *txbuf;
     unsigned int        txbufp, txbufc;
+    bool_t              tx_quench;
     /* Force synchronous transmit. */
     int                 sync;
     /* Receiver callback functions (asynchronous receivers). */
diff -Naurp xen/include/xen/time.h xen-redhat/include/xen/time.h
--- xen/include/xen/time.h
+++ xen-redhat/include/xen/time.h
@@ -63,6 +63,7 @@ struct tm {
 };
 struct tm gmtime(unsigned long t);
 
+#define SYSTEM_TIME_HZ  1000000000ULL
 #define NOW()           ((s_time_t)get_s_time())
 #define SECONDS(_s)     ((s_time_t)((_s)  * 1000000000ULL))
 #define MILLISECS(_ms)  ((s_time_t)((_ms) * 1000000ULL))
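
The new SYSTEM_TIME_HZ constant makes explicit that Xen system time is kept in
nanosecond units, which is also what the existing SECONDS()/MILLISECS()
helpers assume.  A standalone check of the unit arithmetic (not hypervisor
code; int64_t stands in for Xen's s_time_t):

#include <stdio.h>
#include <stdint.h>

typedef int64_t s_time_t;               /* stand-in for Xen's s_time_t */

#define SYSTEM_TIME_HZ  1000000000ULL
#define SECONDS(_s)     ((s_time_t)((_s)  * 1000000000ULL))
#define MILLISECS(_ms)  ((s_time_t)((_ms) * 1000000ULL))

int main(void)
{
    /* 1 s == SYSTEM_TIME_HZ ticks; 30 ms == 30,000,000 ticks. */
    printf("SECONDS(1)    = %lld\n", (long long)SECONDS(1));
    printf("MILLISECS(30) = %lld\n", (long long)MILLISECS(30));
    return 0;
}
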