kernel-2.6.18-194.11.1.el5.src.rpm

From: ddugger@redhat.com <ddugger@redhat.com>
Date: Mon, 23 Mar 2009 10:22:57 -0600
Subject: [xen] add VT-d specific files
Message-id: 200903231622.n2NGMvxa022045@sobek.n0ano.com
O-Subject: [RHEL5.4 PATCH 4/21 V2] add VT-d specific files
Bugzilla: 484227
RH-Acked-by: Gerd Hoffmann <kraxel@redhat.com>
RH-Acked-by: Chris Lalancette <clalance@redhat.com>

Add the major VT-d code: implement VT-d support under
xen/drivers/passthrough/vtd/, perform VT-d initialization and teardown at
boot/exit, and define the IRQ structures and functions that VT-d requires.

Upstream Status: Accepted (CS 15883, 15905, 17195, 17973, 18649)

BZ: 484227

Signed-off-by: Weidong Han <weidong.han@intel.com>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Signed-off-by: Don Dugger <donald.d.dugger@intel.com>
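
With this patch applied, VT-d is controlled by a new "iommu=" hypervisor boot
parameter, parsed by parse_iommu_param() in drivers/passthrough/iommu.c.  A
minimal usage sketch, assuming a GRUB legacy entry of the kind used on RHEL 5
Xen hosts (the kernel path and Dom0 module line are illustrative only):

    kernel /boot/xen.gz iommu=force
    module /boot/vmlinuz-2.6.18-xen ...

"iommu=force" refuses to boot (panics) unless the IOMMU can be enabled,
"iommu=force,passthrough" additionally bypasses VT-d translation for Dom0,
and omitting the option leaves the IOMMU disabled (the default).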

diff --git a/arch/x86/acpi/boot.c b/arch/x86/acpi/boot.c
index a4dddc9..f929ecc 100644
--- a/arch/x86/acpi/boot.c
+++ b/arch/x86/acpi/boot.c
@@ -919,6 +919,8 @@ int __init acpi_boot_init(void)
 
 	acpi_table_parse(ACPI_HPET, acpi_parse_hpet);
 
+	acpi_dmar_init();
+
 	return 0;
 }
 
diff --git a/arch/x86/domain.c b/arch/x86/domain.c
index 0651a71..bd1f214 100644
--- a/arch/x86/domain.c
+++ b/arch/x86/domain.c
@@ -44,6 +44,7 @@
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
 #include <asm/msr.h>
+#include <xen/iommu.h>
 #ifdef CONFIG_COMPAT
 #include <compat/vcpu.h>
 #endif
@@ -416,6 +417,8 @@ int arch_domain_create(struct domain *d)
     int vcpuid, pdpt_order;
     int rc = -ENOMEM;
 
+    INIT_LIST_HEAD(&d->arch.pdev_list);
+
     d->arch.relmem = RELMEM_not_started;
     INIT_LIST_HEAD(&d->arch.relmem_list);
     pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
@@ -480,12 +483,18 @@ int arch_domain_create(struct domain *d)
         clear_page(d->shared_info);
         share_xen_page_with_guest(
             virt_to_page(d->shared_info), d, XENSHARE_writable);
+
+        if ( (rc = iommu_domain_init(d)) != 0 )
+            goto fail;
     }
 
     if ( is_hvm_domain(d) )
     {
         if ( (rc = hvm_domain_initialise(d)) != 0 )
+        {
+            iommu_domain_destroy(d);
             goto fail;
+        }
     }
     else
     {
@@ -514,6 +523,10 @@ void arch_domain_destroy(struct domain *d)
     if ( is_hvm_domain(d) )
         hvm_domain_destroy(d);
 
+    pci_release_devices(d);
+    if ( !is_idle_domain(d) )
+        iommu_domain_destroy(d);
+
     paging_final_teardown(d);
 
     free_xenheap_pages(
diff --git a/arch/x86/irq.c b/arch/x86/irq.c
index 4d17471..b9d761c 100644
--- a/arch/x86/irq.c
+++ b/arch/x86/irq.c
@@ -98,6 +98,39 @@ asmlinkage void do_IRQ(struct cpu_user_regs *regs)
     spin_unlock(&desc->lock);
 }
 
+int request_irq(unsigned int irq,
+        void (*handler)(int, void *, struct cpu_user_regs *),
+        unsigned long irqflags, const char * devname, void *dev_id)
+{
+    struct irqaction * action;
+    int retval;
+
+    /*
+     * Sanity-check the arguments: the IRQ number must be within range
+     * and a real handler must be supplied; anything else would leave us
+     * with an interrupt we cannot sensibly set up or free later.
+     */
+    if (irq >= NR_IRQS)
+        return -EINVAL;
+    if (!handler)
+        return -EINVAL;
+
+    action = xmalloc(struct irqaction);
+    if (!action)
+        return -ENOMEM;
+
+    action->handler = handler;
+    action->name = devname;
+    action->dev_id = dev_id;
+
+    retval = setup_irq(irq, action);
+    if (retval)
+        xfree(action);
+
+    return retval;
+}
+ 
 void free_irq(unsigned int irq)
 {
     unsigned int  vector = irq_to_vector(irq);
diff --git a/arch/x86/setup.c b/arch/x86/setup.c
index c621786..4ff4f6b 100644
--- a/arch/x86/setup.c
+++ b/arch/x86/setup.c
@@ -1122,6 +1122,14 @@ void arch_get_xen_caps(xen_capabilities_info_t *info)
 #endif
 }
 
+int xen_in_range(paddr_t start, paddr_t end)
+{
+    start = max_t(paddr_t, start, xenheap_phys_start);
+    end = min_t(paddr_t, end, xenheap_phys_end);
+
+    return start < end;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/drivers/Makefile b/drivers/Makefile
index 1a41cee..fccfbb3 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -1,4 +1,6 @@
 subdir-y += char
 subdir-y += pci
+subdir-$(x86_32) += passthrough
+subdir-$(x86_64) += passthrough
 subdir-$(HAS_ACPI) += acpi
 subdir-$(HAS_VGA) += video
diff --git a/drivers/passthrough/Makefile b/drivers/passthrough/Makefile
new file mode 100644
index 0000000..662f89c
--- /dev/null
+++ b/drivers/passthrough/Makefile
@@ -0,0 +1,5 @@
+subdir-$(x86_32) += vtd
+subdir-$(x86_64) += vtd
+
+obj-y += iommu.o
+obj-y += pci.o
diff --git a/drivers/passthrough/iommu.c b/drivers/passthrough/iommu.c
new file mode 100644
index 0000000..ea345c7
--- /dev/null
+++ b/drivers/passthrough/iommu.c
@@ -0,0 +1,250 @@
+/*
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <xen/sched.h>
+#include <xen/iommu.h>
+#include <asm/hvm/iommu.h>
+#include <xen/paging.h>
+#include <xen/guest_access.h>
+
+static void parse_iommu_param(char *s);
+int intel_vtd_setup(void);
+
+/*
+ * The 'iommu' parameter enables the IOMMU.  Optional comma separated
+ * value may contain:
+ *
+ *   off|no|false|disable       Disable IOMMU (default)
+ *   force|required             Don't boot unless IOMMU is enabled
+ *   passthrough                Bypass VT-d translation for Dom0
+ */
+custom_param("iommu", parse_iommu_param);
+int iommu_enabled = 0;
+int force_iommu = 0;
+int iommu_passthrough = 0;
+
+static void __init parse_iommu_param(char *s)
+{
+    char *ss;
+    iommu_enabled = 1;
+
+    do {
+        ss = strchr(s, ',');
+        if ( ss )
+            *ss = '\0';
+
+        if ( !strcmp(s, "off") || !strcmp(s, "no") || !strcmp(s, "false") ||
+             !strcmp(s, "0") || !strcmp(s, "disable") )
+            iommu_enabled = 0;
+        else if ( !strcmp(s, "force") || !strcmp(s, "required") )
+            force_iommu = 1;
+        else if ( !strcmp(s, "passthrough") )
+            iommu_passthrough = 1;
+
+        s = ss + 1;
+    } while ( ss );
+}
+
+int iommu_domain_init(struct domain *domain)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(domain);
+
+    spin_lock_init(&hd->mapping_lock);
+    INIT_LIST_HEAD(&hd->g2m_ioport_list);
+
+    if ( !iommu_enabled )
+        return 0;
+
+    hd->platform_ops = iommu_get_ops();
+    return hd->platform_ops->init(domain);
+}
+
+int iommu_add_device(struct pci_dev *pdev)
+{
+    struct hvm_iommu *hd;
+
+    if ( !pdev->domain )
+        return -EINVAL;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
+    hd = domain_hvm_iommu(pdev->domain);
+    if ( !iommu_enabled || !hd->platform_ops )
+        return 0;
+
+    return hd->platform_ops->add_device(pdev);
+}
+
+int iommu_remove_device(struct pci_dev *pdev)
+{
+    struct hvm_iommu *hd;
+    if ( !pdev->domain )
+        return -EINVAL;
+
+    hd = domain_hvm_iommu(pdev->domain);
+    if ( !iommu_enabled || !hd->platform_ops )
+        return 0;
+
+    return hd->platform_ops->remove_device(pdev);
+}
+
+int assign_device(struct domain *d, u8 bus, u8 devfn)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    int rc = 0;
+
+    if ( !iommu_enabled || !hd->platform_ops )
+        return 0;
+
+    spin_lock(&pcidevs_lock);
+    rc = hd->platform_ops->assign_device(d, bus, devfn);
+    spin_unlock(&pcidevs_lock);
+    return rc;
+}
+
+void iommu_domain_destroy(struct domain *d)
+{
+    struct hvm_iommu *hd  = domain_hvm_iommu(d);
+    struct list_head *ioport_list, *tmp;
+    struct g2m_ioport *ioport;
+
+    if ( !iommu_enabled || !hd->platform_ops )
+        return;
+
+    if ( hd )
+    {
+        list_for_each_safe ( ioport_list, tmp, &hd->g2m_ioport_list )
+        {
+            ioport = list_entry(ioport_list, struct g2m_ioport, list);
+            list_del(&ioport->list);
+            xfree(ioport);
+        }
+    }
+
+    return hd->platform_ops->teardown(d);
+}
+
+int iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+
+    if ( !iommu_enabled || !hd->platform_ops )
+        return 0;
+
+    return hd->platform_ops->map_page(d, gfn, mfn);
+}
+
+int iommu_unmap_page(struct domain *d, unsigned long gfn)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+
+    if ( !iommu_enabled || !hd->platform_ops )
+        return 0;
+
+    return hd->platform_ops->unmap_page(d, gfn);
+}
+
+/* caller should hold the pcidevs_lock */
+int deassign_device(struct domain *d, u8 bus, u8 devfn)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    struct pci_dev *pdev = NULL;
+
+    if ( !iommu_enabled || !hd->platform_ops )
+        return -EINVAL;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    pdev = pci_get_pdev(bus, devfn);
+    if (!pdev)
+        return -ENODEV;
+
+    if (pdev->domain != d)
+    {
+        gdprintk(XENLOG_ERR VTDPREFIX,
+                "IOMMU: deassign a device not owned\n");
+        return -EINVAL;
+    }
+
+    return hd->platform_ops->reassign_device(d, dom0, bus, devfn);
+}
+
+static int iommu_setup(void)
+{
+    int rc = -ENODEV;
+
+    if ( !iommu_enabled )
+        goto out;
+
+    rc = iommu_hardware_setup();
+
+    iommu_enabled = (rc == 0);
+
+ out:
+    if ( force_iommu && !iommu_enabled )
+        panic("IOMMU setup failed, crash Xen for security purpose!\n");
+
+    printk("I/O virtualisation %sabled\n", iommu_enabled ? "en" : "dis");
+    return rc;
+}
+__initcall(iommu_setup);
+
+int iommu_get_device_group(struct domain *d, u8 bus, u8 devfn,
+    XEN_GUEST_HANDLE_64(uint32_t) buf, int max_sdevs)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    struct pci_dev *pdev;
+    int group_id, sdev_id;
+    u32 bdf;
+    int i = 0;
+    struct iommu_ops *ops = hd->platform_ops;
+
+    if ( !iommu_enabled || !ops || !ops->get_device_group_id )
+        return 0;
+
+    group_id = ops->get_device_group_id(bus, devfn);
+
+    spin_lock(&pcidevs_lock);
+    for_each_pdev( d, pdev )
+    {
+        if ( (pdev->bus == bus) && (pdev->devfn == devfn) )
+            continue;
+
+        sdev_id = ops->get_device_group_id(pdev->bus, pdev->devfn);
+        if ( (sdev_id == group_id) && (i < max_sdevs) )
+        {
+            bdf = 0;
+            bdf |= (pdev->bus & 0xff) << 16;
+            bdf |= (pdev->devfn & 0xff) << 8;
+            if ( unlikely(copy_to_guest_offset(buf, i, &bdf, 1)) )
+            {
+                spin_unlock(&pcidevs_lock);
+                return -1;
+            }
+            i++;
+        }
+    }
+    spin_unlock(&pcidevs_lock);
+
+    return i;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/drivers/passthrough/pci.c b/drivers/passthrough/pci.c
new file mode 100644
index 0000000..c9a744c
--- /dev/null
+++ b/drivers/passthrough/pci.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (C) 2008,  Netronome Systems, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <xen/sched.h>
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
+#include <xen/list.h>
+#include <xen/prefetch.h>
+#include <xen/iommu.h>
+#include <asm/hvm/iommu.h>
+#include <asm/hvm/irq.h>
+#include <xen/delay.h>
+#include <xen/keyhandler.h>
+
+
+LIST_HEAD(alldevs_list);
+spinlock_t pcidevs_lock = SPIN_LOCK_UNLOCKED;
+
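+/*
+ * Look up the pci_dev tracking structure for bus:devfn on the global
+ * alldevs_list, allocating and enlisting a new one if none exists yet.
+ */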
+struct pci_dev *alloc_pdev(u8 bus, u8 devfn)
+{
+    struct pci_dev *pdev;
+
+    list_for_each_entry ( pdev, &alldevs_list, alldevs_list )
+        if ( pdev->bus == bus && pdev->devfn == devfn )
+            return pdev;
+
+    pdev = xmalloc(struct pci_dev);
+    if ( !pdev )
+        return NULL;
+    memset(pdev, 0, sizeof(struct pci_dev));
+
+    *((u8*) &pdev->bus) = bus;
+    *((u8*) &pdev->devfn) = devfn;
+    pdev->domain = NULL;
+    list_add(&pdev->alldevs_list, &alldevs_list);
+
+    return pdev;
+}
+
+void free_pdev(struct pci_dev *pdev)
+{
+    list_del(&pdev->alldevs_list);
+    xfree(pdev);
+}
+
+struct pci_dev *pci_get_pdev(int bus, int devfn)
+{
+    struct pci_dev *pdev = NULL;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
+    list_for_each_entry ( pdev, &alldevs_list, alldevs_list )
+        if ( (pdev->bus == bus || bus == -1) &&
+             (pdev->devfn == devfn || devfn == -1) )
+        {
+            return pdev;
+        }
+
+    return NULL;
+}
+
+struct pci_dev *pci_get_pdev_by_domain(struct domain *d, int bus, int devfn)
+{
+    struct pci_dev *pdev = NULL;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
+    list_for_each_entry ( pdev, &alldevs_list, alldevs_list )
+         if ( (pdev->bus == bus || bus == -1) &&
+              (pdev->devfn == devfn || devfn == -1) &&
+              (pdev->domain == d) )
+         {
+             return pdev;
+         }
+
+    return NULL;
+}
+
+int pci_add_device(u8 bus, u8 devfn)
+{
+    struct pci_dev *pdev;
+    int ret = -ENOMEM;
+
+    spin_lock(&pcidevs_lock);
+    pdev = alloc_pdev(bus, devfn);
+    if ( !pdev )
+        goto out;
+
+    ret = 0;
+    if ( !pdev->domain )
+    {
+        pdev->domain = dom0;
+        ret = iommu_add_device(pdev);
+        if ( ret )
+            goto out;
+
+        list_add(&pdev->domain_list, &dom0->arch.pdev_list);
+    }
+
+out:
+    spin_unlock(&pcidevs_lock);
+    printk(XENLOG_DEBUG "PCI add device %02x:%02x.%x\n", bus,
+           PCI_SLOT(devfn), PCI_FUNC(devfn));
+    return ret;
+}
+
+int pci_remove_device(u8 bus, u8 devfn)
+{
+    struct pci_dev *pdev;
+    int ret = -ENODEV;
+
+    spin_lock(&pcidevs_lock);
+    list_for_each_entry ( pdev, &alldevs_list, alldevs_list )
+        if ( pdev->bus == bus && pdev->devfn == devfn )
+        {
+            ret = iommu_remove_device(pdev);
+            if ( pdev->domain )
+                list_del(&pdev->domain_list);
+            free_pdev(pdev);
+            printk(XENLOG_DEBUG "PCI remove device %02x:%02x.%x\n", bus,
+                   PCI_SLOT(devfn), PCI_FUNC(devfn));
+            break;
+        }
+
+    spin_unlock(&pcidevs_lock);
+    return ret;
+}
+
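+/*
+ * Tear down a dying domain's passthrough interrupt state: unbind each
+ * guest-bound pirq, kill its timer, and free the dev_intx_gsi_link and
+ * hvm_irq_dpci bookkeeping structures.
+ */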
+static void pci_clean_dpci_irqs(struct domain *d)
+{
+    struct hvm_irq_dpci *hvm_irq_dpci = NULL;
+    uint32_t i;
+    struct list_head *digl_list, *tmp;
+    struct dev_intx_gsi_link *digl;
+
+    if ( !iommu_enabled )
+        return;
+
+    spin_lock(&d->evtchn_lock);
+    hvm_irq_dpci = domain_get_irq_dpci(d);
+    if ( hvm_irq_dpci != NULL )
+    {
+        for ( i = find_first_bit(hvm_irq_dpci->mapping, NR_IRQS);
+              i < NR_IRQS;
+              i = find_next_bit(hvm_irq_dpci->mapping, NR_IRQS, i + 1) )
+        {
+            pirq_guest_unbind(d, i);
+            kill_timer(&hvm_irq_dpci->hvm_timer[irq_to_vector(i)]);
+
+            list_for_each_safe ( digl_list, tmp,
+                                 &hvm_irq_dpci->mirq[i].digl_list )
+            {
+                digl = list_entry(digl_list,
+                                  struct dev_intx_gsi_link, list);
+                list_del(&digl->list);
+                xfree(digl);
+            }
+        }
+
+        d->arch.hvm_domain.irq.dpci = NULL;
+        xfree(hvm_irq_dpci);
+    }
+    spin_unlock(&d->evtchn_lock);
+}
+
+void pci_release_devices(struct domain *d)
+{
+    struct pci_dev *pdev;
+    u8 bus, devfn;
+
+    spin_lock(&pcidevs_lock);
+    pci_clean_dpci_irqs(d);
+    while ( (pdev = pci_get_pdev_by_domain(d, -1, -1)) )
+    {
+        bus = pdev->bus; devfn = pdev->devfn;
+        deassign_device(d, bus, devfn);
+    }
+    spin_unlock(&pcidevs_lock);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/drivers/passthrough/vtd/Makefile b/drivers/passthrough/vtd/Makefile
new file mode 100644
index 0000000..61bc48b
--- /dev/null
+++ b/drivers/passthrough/vtd/Makefile
@@ -0,0 +1,6 @@
+subdir-$(x86_32) += x86
+subdir-$(x86_64) += x86
+
+obj-y += iommu.o
+obj-y += dmar.o
+obj-y += utils.o
diff --git a/drivers/passthrough/vtd/dmar.c b/drivers/passthrough/vtd/dmar.c
new file mode 100644
index 0000000..740a7e7
--- /dev/null
+++ b/drivers/passthrough/vtd/dmar.c
@@ -0,0 +1,514 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Ashok Raj <ashok.raj@intel.com>
+ * Copyright (C) Shaohua Li <shaohua.li@intel.com>
+ * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen
+ */
+
+#include <xen/init.h>
+#include <xen/bitmap.h>
+#include <xen/kernel.h>
+#include <xen/acpi.h>
+#include <xen/mm.h>
+#include <xen/xmalloc.h>
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
+#include <asm/string.h>
+#include "dmar.h"
+
+int vtd_enabled = 1;
+
+#undef PREFIX
+#define PREFIX VTDPREFIX "ACPI DMAR:"
+#define DEBUG
+
+#define MIN_SCOPE_LEN (sizeof(struct acpi_pci_path) + \
+                       sizeof(struct acpi_dev_scope))
+
+LIST_HEAD(acpi_drhd_units);
+LIST_HEAD(acpi_rmrr_units);
+LIST_HEAD(acpi_atsr_units);
+
+u8 dmar_host_address_width;
+
+void dmar_scope_add_buses(struct dmar_scope *scope, u16 sec_bus, u16 sub_bus)
+{
+    sub_bus &= 0xff;
+    if (sec_bus > sub_bus)
+        return;
+
+    while ( sec_bus <= sub_bus )
+        set_bit(sec_bus++, scope->buses);
+}
+
+void dmar_scope_remove_buses(struct dmar_scope *scope, u16 sec_bus, u16 sub_bus)
+{
+    sub_bus &= 0xff;
+    if (sec_bus > sub_bus)
+        return;
+
+    while ( sec_bus <= sub_bus )
+        clear_bit(sec_bus++, scope->buses);
+}
+
+static int __init acpi_register_drhd_unit(struct acpi_drhd_unit *drhd)
+{
+    /*
+     * Add the INCLUDE_ALL unit at the tail, so that a scan of the list
+     * finds it at the very end.
+     */
+    if ( drhd->include_all )
+        list_add_tail(&drhd->list, &acpi_drhd_units);
+    else
+        list_add(&drhd->list, &acpi_drhd_units);
+    return 0;
+}
+
+static int __init acpi_register_rmrr_unit(struct acpi_rmrr_unit *rmrr)
+{
+    list_add(&rmrr->list, &acpi_rmrr_units);
+    return 0;
+}
+
+static void __init disable_all_dmar_units(void)
+{
+    struct acpi_drhd_unit *drhd, *_drhd;
+    struct acpi_rmrr_unit *rmrr, *_rmrr;
+    struct acpi_atsr_unit *atsr, *_atsr;
+
+    list_for_each_entry_safe ( drhd, _drhd, &acpi_drhd_units, list )
+    {
+        list_del(&drhd->list);
+        xfree(drhd);
+    }
+    list_for_each_entry_safe ( rmrr, _rmrr, &acpi_rmrr_units, list )
+    {
+        list_del(&rmrr->list);
+        xfree(rmrr);
+    }
+    list_for_each_entry_safe ( atsr, _atsr, &acpi_atsr_units, list )
+    {
+        list_del(&atsr->list);
+        xfree(atsr);
+    }
+}
+
+static int __init acpi_register_atsr_unit(struct acpi_atsr_unit *atsr)
+{
+    /*
+     * Add the ALL_PORTS unit at the tail, so that a scan of the list
+     * finds it at the very end.
+     */
+    if ( atsr->all_ports )
+        list_add_tail(&atsr->list, &acpi_atsr_units);
+    else
+        list_add(&atsr->list, &acpi_atsr_units);
+    return 0;
+}
+
+struct acpi_drhd_unit * acpi_find_matched_drhd_unit(u8 bus, u8 devfn)
+{
+    struct acpi_drhd_unit *drhd;
+    struct acpi_drhd_unit *found = NULL, *include_all = NULL;
+    int i;
+
+    list_for_each_entry ( drhd, &acpi_drhd_units, list )
+    {
+        for (i = 0; i < drhd->scope.devices_cnt; i++)
+            if ( drhd->scope.devices[i] == PCI_BDF2(bus, devfn) )
+                return drhd;
+
+        if ( test_bit(bus, drhd->scope.buses) )
+            found = drhd;
+
+        if ( drhd->include_all )
+            include_all = drhd;
+    }
+
+    return found ? found : include_all;
+}
+
+struct acpi_atsr_unit * acpi_find_matched_atsr_unit(u8 bus, u8 devfn)
+{
+    struct acpi_atsr_unit *atsr;
+    struct acpi_atsr_unit *found = NULL, *include_all = NULL;
+    int i;
+
+    list_for_each_entry ( atsr, &acpi_atsr_units, list )
+    {
+        for (i = 0; i < atsr->scope.devices_cnt; i++)
+            if ( atsr->scope.devices[i] == PCI_BDF2(bus, devfn) )
+                return atsr;
+
+        if ( test_bit(bus, atsr->scope.buses) )
+            found = atsr;
+
+        if ( atsr->all_ports )
+            include_all = atsr;
+    }
+
+    return found ? found : include_all;
+}
+
+/*
+ * Count number of devices in device scope.  Do not include PCI sub
+ * hierarchies.
+ */
+static int scope_device_count(void *start, void *end)
+{
+    struct acpi_dev_scope *scope;
+    int count = 0;
+
+    while ( start < end )
+    {
+        scope = start;
+        if ( (scope->length < MIN_SCOPE_LEN) ||
+             (scope->dev_type >= ACPI_DEV_ENTRY_COUNT) )
+        {
+            dprintk(XENLOG_WARNING VTDPREFIX, "Invalid device scope.\n");
+            return -EINVAL;
+        }
+
+        if ( scope->dev_type == ACPI_DEV_ENDPOINT ||
+             scope->dev_type == ACPI_DEV_IOAPIC ||
+             scope->dev_type == ACPI_DEV_MSI_HPET )
+            count++;
+
+        start += scope->length;
+    }
+
+    return count;
+}
+
+
+static int __init acpi_parse_dev_scope(void *start, void *end,
+                                       void *acpi_entry, int type)
+{
+    struct dmar_scope *scope = acpi_entry;
+    struct acpi_ioapic_unit *acpi_ioapic_unit;
+    struct acpi_dev_scope *acpi_scope;
+    u16 bus, sub_bus, sec_bus;
+    struct acpi_pci_path *path;
+    int depth, cnt, didx = 0;
+
+    if ( (cnt = scope_device_count(start, end)) < 0 )
+        return cnt;
+
+    scope->devices_cnt = cnt;
+    if ( cnt > 0 )
+    {
+        scope->devices = xmalloc_array(u16, cnt);
+        if ( !scope->devices )
+            return -ENOMEM;
+        memset(scope->devices, 0, sizeof(u16) * cnt);
+    }
+
+    while ( start < end )
+    {
+        acpi_scope = start;
+        path = (struct acpi_pci_path *)(acpi_scope + 1);
+        depth = (acpi_scope->length - sizeof(struct acpi_dev_scope))
+            / sizeof(struct acpi_pci_path);
+        bus = acpi_scope->start_bus;
+
+        while ( --depth > 0 )
+        {
+            bus = pci_conf_read8(bus, path->dev, path->fn, PCI_SECONDARY_BUS);
+            path++;
+        }
+
+        switch ( acpi_scope->dev_type )
+        {
+        case ACPI_DEV_P2PBRIDGE:
+            sec_bus = pci_conf_read8(
+                bus, path->dev, path->fn, PCI_SECONDARY_BUS);
+            sub_bus = pci_conf_read8(
+                bus, path->dev, path->fn, PCI_SUBORDINATE_BUS);
+            dprintk(XENLOG_INFO VTDPREFIX,
+                    "found bridge: bdf = %x:%x.%x  sec = %x  sub = %x\n",
+                    bus, path->dev, path->fn, sec_bus, sub_bus);
+
+            dmar_scope_add_buses(scope, sec_bus, sub_bus);
+            break;
+
+        case ACPI_DEV_MSI_HPET:
+            dprintk(XENLOG_INFO VTDPREFIX, "found MSI HPET: bdf = %x:%x.%x\n",
+                    bus, path->dev, path->fn);
+            scope->devices[didx++] = PCI_BDF(bus, path->dev, path->fn);
+            break;
+
+        case ACPI_DEV_ENDPOINT:
+            dprintk(XENLOG_INFO VTDPREFIX, "found endpoint: bdf = %x:%x.%x\n",
+                    bus, path->dev, path->fn);
+            scope->devices[didx++] = PCI_BDF(bus, path->dev, path->fn);
+            break;
+
+        case ACPI_DEV_IOAPIC:
+            dprintk(XENLOG_INFO VTDPREFIX, "found IOAPIC: bdf = %x:%x.%x\n",
+                    bus, path->dev, path->fn);
+
+            if ( type == DMAR_TYPE )
+            {
+                struct acpi_drhd_unit *drhd = acpi_entry;
+                acpi_ioapic_unit = xmalloc(struct acpi_ioapic_unit);
+                if ( !acpi_ioapic_unit )
+                    return -ENOMEM;
+                acpi_ioapic_unit->apic_id = acpi_scope->enum_id;
+                acpi_ioapic_unit->ioapic.bdf.bus = bus;
+                acpi_ioapic_unit->ioapic.bdf.dev = path->dev;
+                acpi_ioapic_unit->ioapic.bdf.func = path->fn;
+                list_add(&acpi_ioapic_unit->list, &drhd->ioapic_list);
+            }
+
+            scope->devices[didx++] = PCI_BDF(bus, path->dev, path->fn);
+            break;
+        }
+
+        start += acpi_scope->length;
+   }
+
+    return 0;
+}
+
+static int __init
+acpi_parse_one_drhd(struct acpi_dmar_entry_header *header)
+{
+    struct acpi_table_drhd * drhd = (struct acpi_table_drhd *)header;
+    void *dev_scope_start, *dev_scope_end;
+    struct acpi_drhd_unit *dmaru;
+    int ret = 0;
+    static int include_all = 0;
+
+    dmaru = xmalloc(struct acpi_drhd_unit);
+    if ( !dmaru )
+        return -ENOMEM;
+    memset(dmaru, 0, sizeof(struct acpi_drhd_unit));
+
+    dmaru->address = drhd->address;
+    dmaru->include_all = drhd->flags & 1; /* BIT0: INCLUDE_ALL */
+    INIT_LIST_HEAD(&dmaru->ioapic_list);
+    dprintk(XENLOG_INFO VTDPREFIX, "dmaru->address = %"PRIx64"\n",
+            dmaru->address);
+
+    dev_scope_start = (void *)(drhd + 1);
+    dev_scope_end = ((void *)drhd) + header->length;
+    ret = acpi_parse_dev_scope(dev_scope_start, dev_scope_end,
+                               dmaru, DMAR_TYPE);
+
+    if ( dmaru->include_all )
+    {
+        dprintk(XENLOG_INFO VTDPREFIX, "found INCLUDE_ALL\n");
+        /* Only allow one INCLUDE_ALL */
+        if ( include_all )
+        {
+            dprintk(XENLOG_WARNING VTDPREFIX,
+                    "Only one INCLUDE_ALL device scope is allowed\n");
+            ret = -EINVAL;
+        }
+        include_all = 1;
+    }
+
+    if ( ret )
+        xfree(dmaru);
+    else
+        acpi_register_drhd_unit(dmaru);
+    return ret;
+}
+
+static int __init
+acpi_parse_one_rmrr(struct acpi_dmar_entry_header *header)
+{
+    struct acpi_table_rmrr *rmrr = (struct acpi_table_rmrr *)header;
+    struct acpi_rmrr_unit *rmrru;
+    void *dev_scope_start, *dev_scope_end;
+    int ret = 0;
+
+    if ( rmrr->base_address >= rmrr->end_address )
+    {
+        dprintk(XENLOG_ERR VTDPREFIX,
+                "RMRR error: base_addr %"PRIx64" end_address %"PRIx64"\n",
+                rmrr->base_address, rmrr->end_address);
+        return -EFAULT;
+    }
+
+    rmrru = xmalloc(struct acpi_rmrr_unit);
+    if ( !rmrru )
+        return -ENOMEM;
+    memset(rmrru, 0, sizeof(struct acpi_rmrr_unit));
+
+    rmrru->base_address = rmrr->base_address;
+    rmrru->end_address = rmrr->end_address;
+    dev_scope_start = (void *)(rmrr + 1);
+    dev_scope_end   = ((void *)rmrr) + header->length;
+    ret = acpi_parse_dev_scope(dev_scope_start, dev_scope_end,
+                               rmrru, RMRR_TYPE);
+
+    if ( ret || (rmrru->scope.devices_cnt == 0) )
+        xfree(rmrru);
+    else
+        acpi_register_rmrr_unit(rmrru);
+    return ret;
+}
+
+static int __init
+acpi_parse_one_atsr(struct acpi_dmar_entry_header *header)
+{
+    struct acpi_table_atsr *atsr = (struct acpi_table_atsr *)header;
+    struct acpi_atsr_unit *atsru;
+    int ret = 0;
+    static int all_ports;
+    void *dev_scope_start, *dev_scope_end;
+
+    atsru = xmalloc(struct acpi_atsr_unit);
+    if ( !atsru )
+        return -ENOMEM;
+    memset(atsru, 0, sizeof(struct acpi_atsr_unit));
+
+    atsru->all_ports = atsr->flags & 1; /* BIT0: ALL_PORTS */
+    if ( !atsru->all_ports )
+    {
+        dev_scope_start = (void *)(atsr + 1);
+        dev_scope_end   = ((void *)atsr) + header->length;
+        ret = acpi_parse_dev_scope(dev_scope_start, dev_scope_end,
+                                   atsru, ATSR_TYPE);
+    }
+    else
+    {
+        dprintk(XENLOG_INFO VTDPREFIX, "found ALL_PORTS\n");
+        /* Only allow one ALL_PORTS */
+        if ( all_ports )
+        {
+            dprintk(XENLOG_WARNING VTDPREFIX,
+                    "Only one ALL_PORTS device scope is allowed\n");
+            ret = -EINVAL;
+        }
+        all_ports = 1;
+    }
+
+    if ( ret )
+        xfree(atsru);
+    else
+        acpi_register_atsr_unit(atsru);
+    return ret;
+}
+
+static int __init acpi_parse_dmar(unsigned long phys_addr,
+                                  unsigned long size)
+{
+    struct acpi_table_dmar *dmar;
+    struct acpi_dmar_entry_header *entry_header;
+    int ret = 0;
+
+    if ( !phys_addr || !size )
+        return -EINVAL;
+
+    dmar = (struct acpi_table_dmar *)__acpi_map_table(phys_addr, size);
+    if ( !dmar )
+    {
+        dprintk(XENLOG_WARNING VTDPREFIX, "Unable to map DMAR\n");
+        return -ENODEV;
+    }
+
+    if ( !dmar->haw )
+    {
+        dprintk(XENLOG_WARNING VTDPREFIX, "Zero: Invalid DMAR width\n");
+        if ( force_iommu )
+            panic("acpi_parse_dmar: Invalid DMAR width,"
+                  " crash Xen for security purpose!\n");
+        return -EINVAL;
+    }
+
+    dmar_host_address_width = dmar->haw + 1;
+    dprintk(XENLOG_INFO VTDPREFIX, "Host address width %d\n",
+            dmar_host_address_width);
+
+    entry_header = (struct acpi_dmar_entry_header *)(dmar + 1);
+    while ( ((unsigned long)entry_header) <
+            (((unsigned long)dmar) + size) )
+    {
+        switch ( entry_header->type )
+        {
+        case ACPI_DMAR_DRHD:
+            dprintk(XENLOG_INFO VTDPREFIX, "found ACPI_DMAR_DRHD\n");
+            ret = acpi_parse_one_drhd(entry_header);
+            break;
+        case ACPI_DMAR_RMRR:
+            dprintk(XENLOG_INFO VTDPREFIX, "found ACPI_DMAR_RMRR\n");
+            ret = acpi_parse_one_rmrr(entry_header);
+            break;
+        case ACPI_DMAR_ATSR:
+            dprintk(XENLOG_INFO VTDPREFIX, "found ACPI_DMAR_ATSR\n");
+            ret = acpi_parse_one_atsr(entry_header);
+            break;
+        default:
+            dprintk(XENLOG_WARNING VTDPREFIX, "Unknown DMAR structure type\n");
+            ret = -EINVAL;
+            break;
+        }
+        if ( ret )
+            break;
+
+        entry_header = ((void *)entry_header + entry_header->length);
+    }
+
+    /* Zap the ACPI DMAR signature to prevent dom0 from using the VT-d hardware. */
+    dmar->header.signature[0] = '\0';
+
+    if ( ret )
+    {
+        if ( force_iommu )
+            panic("acpi_parse_dmar: Failed to parse ACPI DMAR,"
+                  " crash Xen for security purpose!\n");
+        else
+        {
+            printk(XENLOG_WARNING
+                   "Failed to parse ACPI DMAR.  Disabling VT-d.\n");
+            disable_all_dmar_units();
+        }
+    }
+
+    return ret;
+}
+
+int acpi_dmar_init(void)
+{
+    int rc;
+
+    rc = -ENODEV;
+    if ( force_iommu )
+        iommu_enabled = 1;
+
+    if ( !iommu_enabled )
+        goto fail;
+
+    acpi_table_parse(ACPI_DMAR, acpi_parse_dmar);
+
+    if ( list_empty(&acpi_drhd_units) )
+        goto fail;
+
+    printk("Intel VT-d has been enabled\n");
+
+    return 0;
+
+ fail:
+    if ( force_iommu )
+        panic("acpi_dmar_init: acpi_dmar_init failed,"
+              " crash Xen for security purpose!\n");
+
+    vtd_enabled = 0;
+    return rc;
+}
diff --git a/drivers/passthrough/vtd/dmar.h b/drivers/passthrough/vtd/dmar.h
new file mode 100644
index 0000000..c2132e2
--- /dev/null
+++ b/drivers/passthrough/vtd/dmar.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Ashok Raj <ashok.raj@intel.com>
+ * Copyright (C) Shaohua Li <shaohua.li@intel.com>
+ */
+
+#ifndef _DMAR_H_
+#define _DMAR_H_
+
+#include <xen/list.h>
+#include <xen/iommu.h>
+
+extern u8 dmar_host_address_width;
+
+/* This one is for interrupt remapping */
+struct acpi_ioapic_unit {
+    struct list_head list;
+    int apic_id;
+    union {
+        u16 info;
+        struct {
+            u16 func: 3,
+                dev:  5,
+                bus:  8;
+        }bdf;
+    }ioapic;
+};
+
+struct dmar_scope {
+    DECLARE_BITMAP(buses, 256);         /* buses owned by this unit */
+    u16    *devices;                    /* devices owned by this unit */
+    int    devices_cnt;
+};
+
+struct acpi_drhd_unit {
+    struct dmar_scope scope;            /* must be first member of struct */
+    struct list_head list;
+    u64    address;                     /* register base address of the unit */
+    u8     include_all:1;
+    struct iommu *iommu;
+    struct list_head ioapic_list;
+};
+
+struct acpi_rmrr_unit {
+    struct dmar_scope scope;            /* must be first member of struct */
+    struct list_head list;
+    u64    base_address;
+    u64    end_address;
+    u8     allow_all:1;
+};
+
+struct acpi_atsr_unit {
+    struct dmar_scope scope;            /* must be first member of struct */
+    struct list_head list;
+    u8     all_ports:1;
+};
+
+
+#define for_each_drhd_unit(drhd) \
+    list_for_each_entry(drhd, &acpi_drhd_units, list)
+
+#define for_each_rmrr_device(rmrr, bdf, idx)            \
+    list_for_each_entry(rmrr, &acpi_rmrr_units, list)   \
+        /* assume there never is a bdf == 0 */          \
+        for (idx = 0; (bdf = rmrr->scope.devices[idx]) && \
+                 idx < rmrr->scope.devices_cnt; idx++)
+
+struct acpi_drhd_unit * acpi_find_matched_drhd_unit(u8 bus, u8 devfn);
+struct acpi_atsr_unit * acpi_find_matched_atsr_unit(u8 bus, u8 devfn);
+void dmar_scope_add_buses(struct dmar_scope *scope, u16 sec, u16 sub);
+void dmar_scope_remove_buses(struct dmar_scope *scope, u16 sec, u16 sub);
+
+#define DMAR_TYPE 1
+#define RMRR_TYPE 2
+#define ATSR_TYPE 3
+
+#define DMAR_OPERATION_TIMEOUT MILLISECS(1000)
+
+int vtd_hw_check(void);
+void disable_pmr(struct iommu *iommu);
+int is_usb_device(u8 bus, u8 devfn);
+
+#endif /* _DMAR_H_ */
diff --git a/drivers/passthrough/vtd/extern.h b/drivers/passthrough/vtd/extern.h
new file mode 100644
index 0000000..6fec3e5
--- /dev/null
+++ b/drivers/passthrough/vtd/extern.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Allen Kay <allen.m.kay@intel.com>
+ * Copyright (C) Weidong Han <weidong.han@intel.com>
+ */
+
+#ifndef _VTD_EXTERN_H_
+#define _VTD_EXTERN_H_
+
+#include "dmar.h"
+
+
+void print_iommu_regs(struct acpi_drhd_unit *drhd);
+void print_vtd_entries(struct iommu *iommu, int bus, int devfn, u64 gmfn);
+void dump_iommu_info(unsigned char key);
+
+void clear_fault_bits(struct iommu *iommu);
+
+#endif // _VTD_EXTERN_H_
diff --git a/drivers/passthrough/vtd/iommu.c b/drivers/passthrough/vtd/iommu.c
new file mode 100644
index 0000000..988182f
--- /dev/null
+++ b/drivers/passthrough/vtd/iommu.c
@@ -0,0 +1,1884 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Ashok Raj <ashok.raj@intel.com>
+ * Copyright (C) Shaohua Li <shaohua.li@intel.com>
+ * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen
+ */
+
+#include <xen/irq.h>
+#include <xen/sched.h>
+#include <xen/xmalloc.h>
+#include <xen/domain_page.h>
+#include <xen/iommu.h>
+#include <asm/hvm/iommu.h>
+#include <xen/numa.h>
+#include <xen/time.h>
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
+#include <xen/keyhandler.h>
+#include "iommu.h"
+#include "dmar.h"
+#include "extern.h"
+#include "vtd.h"
+
+#define domain_iommu_domid(d) ((d)->arch.hvm_domain.hvm_iommu.iommu_domid)
+
+static spinlock_t domid_bitmap_lock;    /* protect domain id bitmap */
+static int domid_bitmap_size;           /* domain id bitmap size in bits */
+static unsigned long *domid_bitmap;     /* iommu domain id bitmap */
+
+static void setup_dom0_devices(struct domain *d);
+static void setup_dom0_rmrr(struct domain *d);
+
+#define DID_FIELD_WIDTH 16
+#define DID_HIGH_OFFSET 8
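+/*
+ * Stamp a context entry with the domain's IOMMU domain id, allocating a
+ * fresh id from domid_bitmap the first time the domain needs one.  The id
+ * is stored in the context entry's high word, starting at DID_HIGH_OFFSET.
+ */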
+static void context_set_domain_id(struct context_entry *context,
+                                  struct domain *d)
+{
+    domid_t iommu_domid = domain_iommu_domid(d);
+
+    if ( iommu_domid == 0 )
+    {
+        spin_lock(&domid_bitmap_lock);
+        iommu_domid = find_first_zero_bit(domid_bitmap, domid_bitmap_size);
+        set_bit(iommu_domid, domid_bitmap);
+        spin_unlock(&domid_bitmap_lock);
+        d->arch.hvm_domain.hvm_iommu.iommu_domid = iommu_domid;
+    }
+
+    context->hi &= (1 << DID_HIGH_OFFSET) - 1;
+    context->hi |= iommu_domid << DID_HIGH_OFFSET;
+}
+
+static void iommu_domid_release(struct domain *d)
+{
+    domid_t iommu_domid = domain_iommu_domid(d);
+
+    if ( iommu_domid != 0 )
+    {
+        d->arch.hvm_domain.hvm_iommu.iommu_domid = 0;
+        clear_bit(iommu_domid, domid_bitmap);
+    }
+}
+
+static struct intel_iommu *alloc_intel_iommu(void)
+{
+    struct intel_iommu *intel;
+
+    intel = xmalloc(struct intel_iommu);
+    if ( intel == NULL )
+        return NULL;
+    memset(intel, 0, sizeof(struct intel_iommu));
+
+    return intel;
+}
+
+static void free_intel_iommu(struct intel_iommu *intel)
+{
+    xfree(intel);
+}
+
+struct iommu_flush *iommu_get_flush(struct iommu *iommu)
+{
+    return iommu ? &iommu->intel->flush : NULL;
+}
+
+static unsigned int clflush_size;
+static int iommus_incoherent;
+static void __iommu_flush_cache(void *addr, int size)
+{
+    int i;
+
+    if ( !iommus_incoherent )
+        return;
+
+    for ( i = 0; i < size; i += clflush_size )
+        cacheline_flush((char *)addr + i);
+}
+
+void iommu_flush_cache_entry(void *addr)
+{
+    __iommu_flush_cache(addr, 8);
+}
+
+void iommu_flush_cache_page(void *addr)
+{
+    __iommu_flush_cache(addr, PAGE_SIZE_4K);
+}
+
+int nr_iommus;
+/* context entry handling */
+static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus)
+{
+    struct root_entry *root, *root_entries;
+    u64 maddr;
+
+    ASSERT(spin_is_locked(&iommu->lock));
+    root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
+    root = &root_entries[bus];
+    if ( !root_present(*root) )
+    {
+        maddr = alloc_pgtable_maddr(NULL);
+        if ( maddr == 0 )
+        {
+            unmap_vtd_domain_page(root_entries);
+            return 0;
+        }
+        set_root_value(*root, maddr);
+        set_root_present(*root);
+        iommu_flush_cache_entry(root);
+    }
+    maddr = (u64) get_context_addr(*root);
+    unmap_vtd_domain_page(root_entries);
+    return maddr;
+}
+
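+/*
+ * Walk a domain's VT-d page tables and return the machine address of the
+ * last-level (level 1) page table covering 'addr'.  If 'alloc' is set,
+ * missing intermediate tables are allocated on the way down; otherwise
+ * the walk stops and 0 is returned.
+ */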
+static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(domain);
+    int addr_width = agaw_to_width(hd->agaw);
+    struct dma_pte *parent, *pte = NULL;
+    int level = agaw_to_level(hd->agaw);
+    int offset;
+    u64 pte_maddr = 0, maddr;
+    u64 *vaddr = NULL;
+
+    addr &= (((u64)1) << addr_width) - 1;
+    ASSERT(spin_is_locked(&hd->mapping_lock));
+    if ( hd->pgd_maddr == 0 )
+        if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr(domain)) == 0) )
+            goto out;
+
+    parent = (struct dma_pte *)map_vtd_domain_page(hd->pgd_maddr);
+    while ( level > 1 )
+    {
+        offset = address_level_offset(addr, level);
+        pte = &parent[offset];
+
+        if ( dma_pte_addr(*pte) == 0 )
+        {
+            if ( !alloc )
+                break;
+            maddr = alloc_pgtable_maddr(domain);
+            if ( !maddr )
+                break;
+            dma_set_pte_addr(*pte, maddr);
+            vaddr = map_vtd_domain_page(maddr);
+
+            /*
+             * Higher-level tables always grant read/write; the last-level
+             * page table controls the actual read/write permissions.
+             */
+            dma_set_pte_readable(*pte);
+            dma_set_pte_writable(*pte);
+            iommu_flush_cache_entry(pte);
+        }
+        else
+        {
+            vaddr = map_vtd_domain_page(pte->val);
+        }
+
+        if ( level == 2 )
+        {
+            pte_maddr = pte->val & PAGE_MASK_4K;
+            unmap_vtd_domain_page(vaddr);
+            break;
+        }
+
+        unmap_vtd_domain_page(parent);
+        parent = (struct dma_pte *)vaddr;
+        vaddr = NULL;
+        level--;
+    }
+
+    unmap_vtd_domain_page(parent);
+ out:
+    return pte_maddr;
+}
+
+static void iommu_flush_write_buffer(struct iommu *iommu)
+{
+    u32 val;
+    unsigned long flag;
+    s_time_t start_time;
+
+    if ( !cap_rwbf(iommu->cap) )
+        return;
+    val = iommu->gcmd | DMA_GCMD_WBF;
+
+    spin_lock_irqsave(&iommu->register_lock, flag);
+    dmar_writel(iommu->reg, DMAR_GCMD_REG, val);
+
+    /* Make sure hardware complete it */
+    start_time = NOW();
+    for ( ; ; )
+    {
+        val = dmar_readl(iommu->reg, DMAR_GSTS_REG);
+        if ( !(val & DMA_GSTS_WBFS) )
+            break;
+        if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
+            panic("%s: DMAR hardware is malfunctional,"
+                  " please disable IOMMU\n", __func__);
+        cpu_relax();
+    }
+    spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+/* The return value determines whether a write buffer flush is needed. */
+static int flush_context_reg(
+    void *_iommu,
+    u16 did, u16 source_id, u8 function_mask, u64 type,
+    int non_present_entry_flush)
+{
+    struct iommu *iommu = (struct iommu *) _iommu;
+    u64 val = 0;
+    unsigned long flag;
+    s_time_t start_time;
+
+    /*
+     * In the non-present entry flush case, if hardware doesn't cache
+     * non-present entry we do nothing and if hardware cache non-present
+     * entry, we flush entries of domain 0 (the domain id is used to cache
+     * any non-present entries)
+     */
+    if ( non_present_entry_flush )
+    {
+        if ( !cap_caching_mode(iommu->cap) )
+            return 1;
+        else
+            did = 0;
+    }
+
+    /* use register invalidation */
+    switch ( type )
+    {
+    case DMA_CCMD_GLOBAL_INVL:
+        val = DMA_CCMD_GLOBAL_INVL;
+        break;
+    case DMA_CCMD_DOMAIN_INVL:
+        val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
+        break;
+    case DMA_CCMD_DEVICE_INVL:
+        val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
+            |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask);
+        break;
+    default:
+        BUG();
+    }
+    val |= DMA_CCMD_ICC;
+
+    spin_lock_irqsave(&iommu->register_lock, flag);
+    dmar_writeq(iommu->reg, DMAR_CCMD_REG, val);
+
+    /* Make sure hardware complete it */
+    start_time = NOW();
+    for ( ; ; )
+    {
+        val = dmar_readq(iommu->reg, DMAR_CCMD_REG);
+        if ( !(val & DMA_CCMD_ICC) )
+            break;
+        if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
+            panic("%s: DMAR hardware is malfunctional,"
+                  " please disable IOMMU\n", __func__);
+        cpu_relax();
+    }
+    spin_unlock_irqrestore(&iommu->register_lock, flag);
+    /* flush context entry will implicitly flush write buffer */
+    return 0;
+}
+
+static int inline iommu_flush_context_global(
+    struct iommu *iommu, int non_present_entry_flush)
+{
+    struct iommu_flush *flush = iommu_get_flush(iommu);
+    return flush->context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
+                                 non_present_entry_flush);
+}
+
+static int inline iommu_flush_context_domain(
+    struct iommu *iommu, u16 did, int non_present_entry_flush)
+{
+    struct iommu_flush *flush = iommu_get_flush(iommu);
+    return flush->context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
+                                 non_present_entry_flush);
+}
+
+static int inline iommu_flush_context_device(
+    struct iommu *iommu, u16 did, u16 source_id,
+    u8 function_mask, int non_present_entry_flush)
+{
+    struct iommu_flush *flush = iommu_get_flush(iommu);
+    return flush->context(iommu, did, source_id, function_mask,
+                                 DMA_CCMD_DEVICE_INVL,
+                                 non_present_entry_flush);
+}
+
+/* The return value determines whether a write buffer flush is needed. */
+static int flush_iotlb_reg(void *_iommu, u16 did,
+                               u64 addr, unsigned int size_order, u64 type,
+                               int non_present_entry_flush)
+{
+    struct iommu *iommu = (struct iommu *) _iommu;
+    int tlb_offset = ecap_iotlb_offset(iommu->ecap);
+    u64 val = 0, val_iva = 0;
+    unsigned long flag;
+    s_time_t start_time;
+
+    /*
+     * In the non-present entry flush case, if hardware doesn't cache
+     * non-present entry we do nothing and if hardware cache non-present
+     * entry, we flush entries of domain 0 (the domain id is used to cache
+     * any non-present entries)
+     */
+    if ( non_present_entry_flush )
+    {
+        if ( !cap_caching_mode(iommu->cap) )
+            return 1;
+        else
+            did = 0;
+    }
+
+    /* use register invalidation */
+    switch ( type )
+    {
+    case DMA_TLB_GLOBAL_FLUSH:
+        /* a global flush does not need to set IVA_REG */
+        val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
+        break;
+    case DMA_TLB_DSI_FLUSH:
+        val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+        break;
+    case DMA_TLB_PSI_FLUSH:
+        val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+        /* Note: always flush non-leaf currently */
+        val_iva = size_order | addr;
+        break;
+    default:
+        BUG();
+    }
+    /* Note: set drain read/write */
+    if ( cap_read_drain(iommu->cap) )
+        val |= DMA_TLB_READ_DRAIN;
+    if ( cap_write_drain(iommu->cap) )
+        val |= DMA_TLB_WRITE_DRAIN;
+
+    spin_lock_irqsave(&iommu->register_lock, flag);
+    /* Note: Only uses first TLB reg currently */
+    if ( val_iva )
+        dmar_writeq(iommu->reg, tlb_offset, val_iva);
+    dmar_writeq(iommu->reg, tlb_offset + 8, val);
+
+    /* Make sure hardware complete it */
+    start_time = NOW();
+    for ( ; ; )
+    {
+        val = dmar_readq(iommu->reg, tlb_offset + 8);
+        if ( !(val & DMA_TLB_IVT) )
+            break;
+        if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
+            panic("%s: DMAR hardware is malfunctional,"
+                  " please disable IOMMU\n", __func__);
+        cpu_relax();
+    }
+    spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+    /* check IOTLB invalidation granularity */
+    if ( DMA_TLB_IAIG(val) == 0 )
+        dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n");
+
+    /* flush iotlb entry will implicitly flush write buffer */
+    return 0;
+}
+
+static int inline iommu_flush_iotlb_global(struct iommu *iommu,
+                                           int non_present_entry_flush)
+{
+    struct iommu_flush *flush = iommu_get_flush(iommu);
+    return flush->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
+                               non_present_entry_flush);
+}
+
+static int inline iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did,
+                                        int non_present_entry_flush)
+{
+    struct iommu_flush *flush = iommu_get_flush(iommu);
+    return flush->iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
+                               non_present_entry_flush);
+}
+
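+/*
+ * Return the smallest mask order at which 'base' and the last address of
+ * the [base, base + size) range fall within the same naturally aligned
+ * power-of-two block; used below to size page-selective IOTLB flushes.
+ */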
+static int inline get_alignment(u64 base, unsigned int size)
+{
+    int t = 0;
+    u64 end;
+
+    end = base + size - 1;
+    while ( base != end )
+    {
+        t++;
+        base >>= 1;
+        end >>= 1;
+    }
+    return t;
+}
+
+static int inline iommu_flush_iotlb_psi(
+    struct iommu *iommu, u16 did,
+    u64 addr, unsigned int pages, int non_present_entry_flush)
+{
+    unsigned int align;
+    struct iommu_flush *flush = iommu_get_flush(iommu);
+
+    ASSERT(!(addr & (~PAGE_MASK_4K)));
+    ASSERT(pages > 0);
+
+    /* Fallback to domain selective flush if no PSI support */
+    if ( !cap_pgsel_inv(iommu->cap) )
+        return iommu_flush_iotlb_dsi(iommu, did,
+                                     non_present_entry_flush);
+
+    /*
+     * PSI requires page size is 2 ^ x, and the base address is naturally
+     * aligned to the size
+     */
+    align = get_alignment(addr >> PAGE_SHIFT_4K, pages);
+    /* Fallback to domain selective flush if size is too big */
+    if ( align > cap_max_amask_val(iommu->cap) )
+        return iommu_flush_iotlb_dsi(iommu, did,
+                                     non_present_entry_flush);
+
+    addr >>= PAGE_SHIFT_4K + align;
+    addr <<= PAGE_SHIFT_4K + align;
+
+    return flush->iotlb(iommu, did, addr, align,
+                               DMA_TLB_PSI_FLUSH, non_present_entry_flush);
+}
+
+void iommu_flush_all(void)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+
+    flush_all_cache();
+    for_each_drhd_unit ( drhd )
+    {
+        iommu = drhd->iommu;
+        iommu_flush_context_global(iommu, 0);
+        iommu_flush_iotlb_global(iommu, 0);
+    }
+}
+
+/* clear one page's page table */
+static void dma_pte_clear_one(struct domain *domain, u64 addr)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(domain);
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    struct dma_pte *page = NULL, *pte = NULL;
+    u64 pg_maddr;
+
+    spin_lock(&hd->mapping_lock);
+    /* get last level pte */
+    pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
+    if ( pg_maddr == 0 )
+    {
+        spin_unlock(&hd->mapping_lock);
+        return;
+    }
+
+    page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
+    pte = page + address_level_offset(addr, 1);
+
+    if ( !dma_pte_present(*pte) )
+    {
+        spin_unlock(&hd->mapping_lock);
+        unmap_vtd_domain_page(page);
+        return;
+    }
+
+    dma_clear_pte(*pte);
+    spin_unlock(&hd->mapping_lock);
+    iommu_flush_cache_entry(pte);
+
+    /* No need for pcidevs_lock here; it is taken around device assign/deassign. */
+    for_each_drhd_unit ( drhd )
+    {
+        iommu = drhd->iommu;
+        if ( test_bit(iommu->index, &hd->iommu_bitmap) )
+            if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(domain),
+                                       addr, 1, 0))
+                iommu_flush_write_buffer(iommu);
+    }
+
+    unmap_vtd_domain_page(page);
+}
+
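+/*
+ * Recursively free the VT-d page-table subtree rooted at 'pt_maddr'
+ * (a table at the given 'level'), clearing and flushing each present
+ * entry before releasing the page itself.
+ */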
+static void iommu_free_pagetable(u64 pt_maddr, int level)
+{
+    int i;
+    struct dma_pte *pt_vaddr, *pte;
+    int next_level = level - 1;
+
+    if ( pt_maddr == 0 )
+        return;
+
+    pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
+
+    for ( i = 0; i < PTE_NUM; i++ )
+    {
+        pte = &pt_vaddr[i];
+        if ( !dma_pte_present(*pte) )
+            continue;
+
+        if ( next_level >= 1 )
+            iommu_free_pagetable(dma_pte_addr(*pte), next_level);
+
+        dma_clear_pte(*pte);
+        iommu_flush_cache_entry(pte);
+    }
+
+    unmap_vtd_domain_page(pt_vaddr);
+    free_pgtable_maddr(pt_maddr);
+}
+
+static int iommu_set_root_entry(struct iommu *iommu)
+{
+    u32 cmd, sts;
+    unsigned long flags;
+    s_time_t start_time;
+
+    spin_lock(&iommu->lock);
+
+    if ( iommu->root_maddr == 0 )
+        iommu->root_maddr = alloc_pgtable_maddr(NULL);
+    if ( iommu->root_maddr == 0 )
+    {
+        spin_unlock(&iommu->lock);
+        return -ENOMEM;
+    }
+
+    spin_unlock(&iommu->lock);
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr);
+    cmd = iommu->gcmd | DMA_GCMD_SRTP;
+    dmar_writel(iommu->reg, DMAR_GCMD_REG, cmd);
+
+    /* Make sure hardware complete it */
+    start_time = NOW();
+    for ( ; ; )
+    {
+        sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
+        if ( sts & DMA_GSTS_RTPS )
+            break;
+        if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
+            panic("%s: DMAR hardware is malfunctional,"
+                  " please disable IOMMU\n", __func__);
+        cpu_relax();
+    }
+
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+    return 0;
+}
+
+static void iommu_enable_translation(struct iommu *iommu)
+{
+    u32 sts;
+    unsigned long flags;
+    s_time_t start_time;
+
+    dprintk(XENLOG_INFO VTDPREFIX,
+            "iommu_enable_translation: iommu->reg = %p\n", iommu->reg);
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    iommu->gcmd |= DMA_GCMD_TE;
+    dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
+    /* Make sure hardware complete it */
+    start_time = NOW();
+    for ( ; ; )
+    {
+        sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
+        if ( sts & DMA_GSTS_TES )
+            break;
+        if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
+            panic("%s: DMAR hardware is malfunctional,"
+                  " please disable IOMMU\n", __func__);
+        cpu_relax();
+    }
+
+    /* Disable PMRs when VT-d engine takes effect per spec definition */
+    disable_pmr(iommu);
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+int iommu_disable_translation(struct iommu *iommu)
+{
+    u32 sts;
+    unsigned long flags;
+    s_time_t start_time;
+
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    iommu->gcmd &= ~ DMA_GCMD_TE;
+    dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
+
+    /* Make sure hardware complete it */
+    start_time = NOW();
+    for ( ; ; )
+    {
+        sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
+        if ( !(sts & DMA_GSTS_TES) )
+            break;
+        if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
+            panic("%s: DMAR hardware is malfunctional,"
+                  " please disable IOMMU\n", __func__);
+        cpu_relax();
+    }
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+    return 0;
+}
+
+static struct iommu *vector_to_iommu[NR_VECTORS];
+static int iommu_page_fault_do_one(struct iommu *iommu, int type,
+                                   u8 fault_reason, u16 source_id, u64 addr)
+{
+    dprintk(XENLOG_WARNING VTDPREFIX,
+            "iommu_fault:%s: %x:%x.%x addr %"PRIx64" REASON %x "
+            "iommu->reg = %p\n",
+            (type ? "DMA Read" : "DMA Write"), (source_id >> 8),
+            PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr,
+            fault_reason, iommu->reg);
+
+#ifndef __i386__ /* map_domain_page() cannot be used in this context */
+    if ( fault_reason < 0x20 )
+        print_vtd_entries(iommu, (source_id >> 8),
+                          (source_id & 0xff), (addr >> PAGE_SHIFT));
+#endif
+
+    return 0;
+}
+
+static void iommu_fault_status(u32 fault_status)
+{
+    if ( fault_status & DMA_FSTS_PFO )
+        dprintk(XENLOG_ERR VTDPREFIX,
+            "iommu_fault_status: Fault Overflow\n");
+    if ( fault_status & DMA_FSTS_PPF )
+        dprintk(XENLOG_ERR VTDPREFIX,
+            "iommu_fault_status: Primary Pending Fault\n");
+    if ( fault_status & DMA_FSTS_AFO )
+        dprintk(XENLOG_ERR VTDPREFIX,
+            "iommu_fault_status: Advanced Fault Overflow\n");
+    if ( fault_status & DMA_FSTS_APF )
+        dprintk(XENLOG_ERR VTDPREFIX,
+            "iommu_fault_status: Advanced Pending Fault\n");
+    if ( fault_status & DMA_FSTS_IQE )
+        dprintk(XENLOG_ERR VTDPREFIX,
+            "iommu_fault_status: Invalidation Queue Error\n");
+    if ( fault_status & DMA_FSTS_ICE )
+        dprintk(XENLOG_ERR VTDPREFIX,
+            "iommu_fault_status: Invalidation Completion Error\n");
+    if ( fault_status & DMA_FSTS_ITE )
+        dprintk(XENLOG_ERR VTDPREFIX,
+            "iommu_fault_status: Invalidation Time-out Error\n");
+}
+
+#define PRIMARY_FAULT_REG_LEN (16)
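+/*
+ * Fault-event handler: walk the primary fault recording registers,
+ * report and clear each pending fault, then clear any primary fault
+ * overflow condition.
+ */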
+static void iommu_page_fault(int vector, void *dev_id,
+                             struct cpu_user_regs *regs)
+{
+    struct iommu *iommu = dev_id;
+    int reg, fault_index;
+    u32 fault_status;
+    unsigned long flags;
+
+    dprintk(XENLOG_WARNING VTDPREFIX,
+            "iommu_page_fault: iommu->reg = %p\n", iommu->reg);
+
+    fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
+
+    iommu_fault_status(fault_status);
+
+    /* FIXME: ignore advanced fault log */
+    if ( !(fault_status & DMA_FSTS_PPF) )
+        goto clear_overflow;
+
+    fault_index = dma_fsts_fault_record_index(fault_status);
+    reg = cap_fault_reg_offset(iommu->cap);
+    while (1)
+    {
+        u8 fault_reason;
+        u16 source_id;
+        u32 data;
+        u64 guest_addr;
+        int type;
+
+        /* highest 32 bits */
+        spin_lock_irqsave(&iommu->register_lock, flags);
+        data = dmar_readl(iommu->reg, reg +
+                          fault_index * PRIMARY_FAULT_REG_LEN + 12);
+        if ( !(data & DMA_FRCD_F) )
+        {
+            spin_unlock_irqrestore(&iommu->register_lock, flags);
+            break;
+        }
+
+        fault_reason = dma_frcd_fault_reason(data);
+        type = dma_frcd_type(data);
+
+        data = dmar_readl(iommu->reg, reg +
+                          fault_index * PRIMARY_FAULT_REG_LEN + 8);
+        source_id = dma_frcd_source_id(data);
+
+        guest_addr = dmar_readq(iommu->reg, reg +
+                                fault_index * PRIMARY_FAULT_REG_LEN);
+        guest_addr = dma_frcd_page_addr(guest_addr);
+        /* clear the fault */
+        dmar_writel(iommu->reg, reg +
+                    fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F);
+        spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+        iommu_page_fault_do_one(iommu, type, fault_reason,
+                                source_id, guest_addr);
+
+        fault_index++;
+        if ( fault_index >= cap_num_fault_regs(iommu->cap) )
+            fault_index = 0;
+    }
+clear_overflow:
+    /* clear primary fault overflow */
+    fault_status = readl(iommu->reg + DMAR_FSTS_REG);
+    if ( fault_status & DMA_FSTS_PFO )
+    {
+        spin_lock_irqsave(&iommu->register_lock, flags);
+        dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
+        spin_unlock_irqrestore(&iommu->register_lock, flags);
+    }
+}
+
+static void dma_msi_unmask(unsigned int vector)
+{
+    struct iommu *iommu = vector_to_iommu[vector];
+    unsigned long flags;
+
+    /* unmask it */
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static void dma_msi_mask(unsigned int vector)
+{
+    unsigned long flags;
+    struct iommu *iommu = vector_to_iommu[vector];
+
+    /* mask it */
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static unsigned int dma_msi_startup(unsigned int vector)
+{
+    dma_msi_unmask(vector);
+    return 0;
+}
+
+static void dma_msi_end(unsigned int vector)
+{
+    dma_msi_unmask(vector);
+    ack_APIC_irq();
+}
+
+static void dma_msi_data_init(struct iommu *iommu, int vector)
+{
+    u32 msi_data = 0;
+    unsigned long flags;
+
+    /* Fixed, edge, assert mode. Follow MSI setting */
+    msi_data |= vector & 0xff;
+    msi_data |= 1 << 14;
+
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    dmar_writel(iommu->reg, DMAR_FEDATA_REG, msi_data);
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+#ifdef SUPPORT_MSI_REMAPPING
+static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu)
+{
+    u64 msi_address;
+    unsigned long flags;
+
+    /* Physical, dedicated cpu. Follow MSI setting */
+    msi_address = (MSI_ADDRESS_HEADER << (MSI_ADDRESS_HEADER_SHIFT + 8));
+    msi_address |= MSI_PHYSICAL_MODE << 2;
+    msi_address |= MSI_REDIRECTION_HINT_MODE << 3;
+    msi_address |= phy_cpu << MSI_TARGET_CPU_SHIFT;
+
+    spin_lock_irqsave(&iommu->register_lock, flags);
+    dmar_writel(iommu->reg, DMAR_FEADDR_REG, (u32)msi_address);
+    dmar_writel(iommu->reg, DMAR_FEUADDR_REG, (u32)(msi_address >> 32));
+    spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+#else
+static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu)
+{
+    /* ia64: TODO */
+}
+#endif
+
+static void dma_msi_set_affinity(unsigned int vector, cpumask_t dest)
+{
+    struct iommu *iommu = vector_to_iommu[vector];
+    dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(dest)));
+}
+
+static struct hw_interrupt_type dma_msi_type = {
+    .typename = "DMA_MSI",
+    .startup = dma_msi_startup,
+    .shutdown = dma_msi_mask,
+    .enable = dma_msi_unmask,
+    .disable = dma_msi_mask,
+    .ack = dma_msi_mask,
+    .end = dma_msi_end,
+    .set_affinity = dma_msi_set_affinity,
+};
+
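+/*
+ * Allocate a vector for the unit's fault-event MSI and attach
+ * iommu_page_fault() to it via the dma_msi irq type.
+ */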
+int iommu_set_interrupt(struct iommu *iommu)
+{
+    int vector, ret;
+
+    vector = assign_irq_vector(AUTO_ASSIGN);
+    vector_to_iommu[vector] = iommu;
+
+    /* VT-d fault is a MSI, make irq == vector */
+    irq_vector[vector] = vector;
+    vector_irq[vector] = vector;
+
+    if ( !vector )
+    {
+        gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no vectors\n");
+        return -EINVAL;
+    }
+
+    irq_desc[vector].handler = &dma_msi_type;
+    ret = request_irq(vector, iommu_page_fault, 0, "dmar", iommu);
+    if ( ret )
+        gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
+    return vector;
+}
+
+static int iommu_alloc(struct acpi_drhd_unit *drhd)
+{
+    struct iommu *iommu;
+    unsigned long sagaw;
+    int agaw;
+
+    if ( nr_iommus >= MAX_IOMMUS )
+    {
+        gdprintk(XENLOG_ERR VTDPREFIX,
+                 "IOMMU: nr_iommus %d >= MAX_IOMMUS\n", nr_iommus);
+        return -ENOMEM;
+    }
+
+    iommu = xmalloc(struct iommu);
+    if ( iommu == NULL )
+        return -ENOMEM;
+    memset(iommu, 0, sizeof(struct iommu));
+
+    iommu->intel = alloc_intel_iommu();
+    if ( iommu->intel == NULL )
+    {
+        xfree(iommu);
+        return -ENOMEM;
+    }
+
+    iommu->reg = map_to_nocache_virt(nr_iommus, drhd->address);
+    iommu->index = nr_iommus++;
+
+    iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
+    iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG);
+
+    /* Calculate number of pagetable levels: between 2 and 4. */
+    sagaw = cap_sagaw(iommu->cap);
+    for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- )
+        if ( test_bit(agaw, &sagaw) )
+            break;
+    if ( agaw < 0 )
+    {
+        gdprintk(XENLOG_ERR VTDPREFIX,
+                 "IOMMU: unsupported sagaw %lx\n", sagaw);
+        xfree(iommu);
+        return -ENODEV;
+    }
+    iommu->nr_pt_levels = agaw_to_level(agaw);
+
+    if ( !ecap_coherent(iommu->ecap) )
+        iommus_incoherent = 1;
+
+    spin_lock_init(&iommu->lock);
+    spin_lock_init(&iommu->register_lock);
+
+    drhd->iommu = iommu;
+    return 0;
+}
+
+static void iommu_free(struct acpi_drhd_unit *drhd)
+{
+    struct iommu *iommu = drhd->iommu;
+
+    if ( iommu == NULL )
+        return;
+
+    if ( iommu->root_maddr != 0 )
+    {
+        free_pgtable_maddr(iommu->root_maddr);
+        iommu->root_maddr = 0;
+    }
+
+    if ( iommu->reg )
+        iounmap(iommu->reg);
+
+    free_intel_iommu(iommu->intel);
+    free_irq(iommu->vector);
+    xfree(iommu);
+
+    drhd->iommu = NULL;
+}
+
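+/*
+ * Round a guest address width up to the nearest adjusted guest address
+ * width (AGAW) VT-d can use: 12 bits of page offset plus a multiple of
+ * 9 bits per page-table level, capped at 64.  For example, a 32-bit
+ * width becomes 39, since (32 - 12) % 9 == 2 and 32 + 9 - 2 == 39.
+ */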
+#define guestwidth_to_adjustwidth(gaw) ({       \
+    int agaw, r = (gaw - 12) % 9;               \
+    agaw = (r == 0) ? gaw : (gaw + 9 - r);      \
+    if ( agaw > 64 )                            \
+        agaw = 64;                              \
+    agaw; })
+
+static int intel_iommu_domain_init(struct domain *d)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    u64 i, j, tmp;
+    struct acpi_drhd_unit *drhd;
+
+    hd->agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
+
+    if ( d->domain_id == 0 )
+    {
+        extern int xen_in_range(paddr_t start, paddr_t end);
+
+        /*
+         * Set up 1:1 page table for dom0 except the critical segments
+         * like Xen.
+         */
+        for ( i = 0; i < max_page; i++ )
+        {
+            if ( xen_in_range(i << PAGE_SHIFT, (i + 1) << PAGE_SHIFT) )
+                continue;
+
+            tmp = 1 << (PAGE_SHIFT - PAGE_SHIFT_4K);
+            for ( j = 0; j < tmp; j++ )
+                iommu_map_page(d, (i*tmp+j), (i*tmp+j));
+        }
+
+        setup_dom0_devices(d);
+        setup_dom0_rmrr(d);
+
+        iommu_flush_all();
+
+        for_each_drhd_unit ( drhd )
+            iommu_enable_translation(drhd->iommu);
+    }
+
+    return 0;
+}
+
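+/*
+ * Fill in the context entry for (bus, devfn) on one DMAR unit: either
+ * pass-through for dom0 (when supported) or the root of the domain's
+ * multi-level page table, then flush the context cache and IOTLB.
+ */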
+static int domain_context_mapping_one(
+    struct domain *domain,
+    struct iommu *iommu,
+    u8 bus, u8 devfn)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(domain);
+    struct context_entry *context, *context_entries;
+    u64 maddr, pgd_maddr;
+    struct pci_dev *pdev = NULL;
+    int agaw;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    spin_lock(&iommu->lock);
+    maddr = bus_to_context_maddr(iommu, bus);
+    context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
+    context = &context_entries[devfn];
+
+    if ( context_present(*context) )
+    {
+        int res = 0;
+
+        pdev = pci_get_pdev(bus, devfn);
+        if (!pdev)
+            res = -ENODEV;
+        else if (pdev->domain != domain)
+            res = -EINVAL;
+        unmap_vtd_domain_page(context_entries);
+        spin_unlock(&iommu->lock);
+        return res;
+    }
+
+    if ( iommu_passthrough &&
+         ecap_pass_thru(iommu->ecap) && (domain->domain_id == 0) )
+    {
+        context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
+        agaw = level_to_agaw(iommu->nr_pt_levels);
+    }
+    else
+    {
+        spin_lock(&hd->mapping_lock);
+
+        /* Ensure we have pagetables allocated down to leaf PTE. */
+        if ( hd->pgd_maddr == 0 )
+        {
+            addr_to_dma_page_maddr(domain, 0, 1);
+            if ( hd->pgd_maddr == 0 )
+            {
+            nomem:
+                spin_unlock(&hd->mapping_lock);
+                spin_unlock(&iommu->lock);
+                unmap_vtd_domain_page(context_entries);
+                return -ENOMEM;
+            }
+        }
+
+        /* Skip top levels of page tables for 2- and 3-level DRHDs. */
+        pgd_maddr = hd->pgd_maddr;
+        for ( agaw = level_to_agaw(4);
+              agaw != level_to_agaw(iommu->nr_pt_levels);
+              agaw-- )
+        {
+            struct dma_pte *p = map_vtd_domain_page(pgd_maddr);
+            pgd_maddr = dma_pte_addr(*p);
+            unmap_vtd_domain_page(p);
+            if ( pgd_maddr == 0 )
+                goto nomem;
+        }
+
+        context_set_address_root(*context, pgd_maddr);
+        context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
+        spin_unlock(&hd->mapping_lock);
+    }
+
+    /*
+     * domain_id 0 is not valid on Intel's IOMMU: domain IDs must be
+     * 1-based, as required by the hardware.
+     */
+    context_set_domain_id(context, domain);
+    context_set_address_width(*context, agaw);
+    context_set_fault_enable(*context);
+    context_set_present(*context);
+    iommu_flush_cache_entry(context);
+    spin_unlock(&iommu->lock);
+
+    /* Context entry was previously non-present (with domid 0). */
+    if ( iommu_flush_context_device(iommu, 0, (((u16)bus) << 8) | devfn,
+                                    DMA_CCMD_MASK_NOBIT, 1) )
+        iommu_flush_write_buffer(iommu);
+    else
+        iommu_flush_iotlb_dsi(iommu, 0, 1);
+
+    set_bit(iommu->index, &hd->iommu_bitmap);
+
+    unmap_vtd_domain_page(context_entries);
+
+    return 0;
+}
+
+#define PCI_BASE_CLASS_BRIDGE    0x06
+#define PCI_CLASS_BRIDGE_PCI     0x0604
+
+enum {
+    DEV_TYPE_PCIe_ENDPOINT,
+    DEV_TYPE_PCIe_BRIDGE,    /* PCIe root port, switch */
+    DEV_TYPE_PCI_BRIDGE,     /* PCIe-to-PCI/PCI-X bridge, PCI-to-PCI bridge */
+    DEV_TYPE_PCI,
+};
+
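+/* Classify a device from its class code and PCI Express capability. */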
+int pdev_type(u8 bus, u8 devfn)
+{
+    u16 class_device;
+    u16 status, creg;
+    int pos;
+    u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn);
+
+    class_device = pci_conf_read16(bus, d, f, PCI_CLASS_DEVICE);
+    if ( class_device == PCI_CLASS_BRIDGE_PCI )
+    {
+        pos = pci_find_next_cap(bus, devfn,
+                                PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP);
+        if ( !pos )
+            return DEV_TYPE_PCI_BRIDGE;
+        creg = pci_conf_read16(bus, d, f, pos + PCI_EXP_FLAGS);
+        return ((creg & PCI_EXP_FLAGS_TYPE) >> 4) == PCI_EXP_TYPE_PCI_BRIDGE ?
+            DEV_TYPE_PCI_BRIDGE : DEV_TYPE_PCIe_BRIDGE;
+    }
+
+    status = pci_conf_read16(bus, d, f, PCI_STATUS);
+    if ( !(status & PCI_STATUS_CAP_LIST) )
+        return DEV_TYPE_PCI;
+
+    if ( pci_find_next_cap(bus, devfn, PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP) )
+        return DEV_TYPE_PCIe_ENDPOINT;
+
+    return DEV_TYPE_PCI;
+}
+
+#define MAX_BUSES 256
+static DEFINE_SPINLOCK(bus2bridge_lock);
+static struct { u8 map, bus, devfn; } bus2bridge[MAX_BUSES];
+
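+/*
+ * bus2bridge maps a secondary bus number to the (bus, devfn) of the
+ * bridge above it.  _find_pcie_endpoint() walks that chain upwards so
+ * that transactions from conventional PCI devices can be attributed to
+ * the requester ID the topmost bridge emits on their behalf.
+ */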
+static int _find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
+{
+    int cnt = 0;
+    *secbus = *bus;
+
+    ASSERT(spin_is_locked(&bus2bridge_lock));
+    if ( !bus2bridge[*bus].map )
+        return 0;
+
+    while ( bus2bridge[*bus].map )
+    {
+        *secbus = *bus;
+        *devfn = bus2bridge[*bus].devfn;
+        *bus = bus2bridge[*bus].bus;
+        if ( cnt++ >= MAX_BUSES )
+            return 0;
+    }
+
+    return 1;
+}
+
+static int find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
+{
+    int ret = 0;
+
+    if ( *bus == 0 )
+        /* Assume integrated PCI devices in the root complex have a
+         * valid requester ID. */
+        return 1;
+
+    spin_lock(&bus2bridge_lock);
+    ret = _find_pcie_endpoint(bus, devfn, secbus);
+    spin_unlock(&bus2bridge_lock);
+
+    return ret;
+}
+
+static int domain_context_mapping(struct domain *domain, u8 bus, u8 devfn)
+{
+    struct acpi_drhd_unit *drhd;
+    int ret = 0;
+    u16 sec_bus, sub_bus;
+    u32 type;
+    u8 secbus;
+
+    drhd = acpi_find_matched_drhd_unit(bus, devfn);
+    if ( !drhd )
+        return -ENODEV;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
+    type = pdev_type(bus, devfn);
+    switch ( type )
+    {
+    case DEV_TYPE_PCIe_BRIDGE:
+        break;
+
+    case DEV_TYPE_PCI_BRIDGE:
+        sec_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+                                 PCI_SECONDARY_BUS);
+        sub_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+                                 PCI_SUBORDINATE_BUS);
+
+        spin_lock(&bus2bridge_lock);
+        for ( sub_bus &= 0xff; sec_bus <= sub_bus; sec_bus++ )
+        {
+            bus2bridge[sec_bus].map = 1;
+            bus2bridge[sec_bus].bus =  bus;
+            bus2bridge[sec_bus].devfn =  devfn;
+        }
+        spin_unlock(&bus2bridge_lock);
+        break;
+
+    case DEV_TYPE_PCIe_ENDPOINT:
+        gdprintk(XENLOG_INFO VTDPREFIX,
+                 "domain_context_mapping:PCIe: bdf = %x:%x.%x\n",
+                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+        ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
+        break;
+
+    case DEV_TYPE_PCI:
+        gdprintk(XENLOG_INFO VTDPREFIX,
+                 "domain_context_mapping:PCI: bdf = %x:%x.%x\n",
+                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+        ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
+        if ( ret )
+            break;
+
+        secbus = bus;
+        /* dependent devices mapping */
+        while ( bus2bridge[bus].map )
+        {
+            secbus = bus;
+            devfn = bus2bridge[bus].devfn;
+            bus = bus2bridge[bus].bus;
+            ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
+            if ( ret )
+                return ret;
+        }
+
+        if ( secbus != bus )
+            /*
+             * The source-id for transactions on non-PCIe buses seems
+             * to originate from devfn=0 on the secondary bus behind
+             * the bridge.  Map that id as well.  The id to use in
+             * these scenarios is not particularly well documented
+             * anywhere.
+             */
+            ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0);
+        break;
+
+    default:
+        gdprintk(XENLOG_ERR VTDPREFIX,
+                 "domain_context_mapping:unknown type : bdf = %x:%x.%x\n",
+                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+        ret = -EINVAL;
+        break;
+    }
+
+    return ret;
+}
+
+static int domain_context_unmap_one(
+    struct domain *domain,
+    struct iommu *iommu,
+    u8 bus, u8 devfn)
+{
+    struct context_entry *context, *context_entries;
+    u64 maddr;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    spin_lock(&iommu->lock);
+
+    maddr = bus_to_context_maddr(iommu, bus);
+    context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
+    context = &context_entries[devfn];
+
+    if ( !context_present(*context) )
+    {
+        spin_unlock(&iommu->lock);
+        unmap_vtd_domain_page(context_entries);
+        return 0;
+    }
+
+    context_clear_present(*context);
+    context_clear_entry(*context);
+    iommu_flush_cache_entry(context);
+
+    if ( iommu_flush_context_device(iommu, domain_iommu_domid(domain),
+                                    (((u16)bus) << 8) | devfn,
+                                    DMA_CCMD_MASK_NOBIT, 0) )
+        iommu_flush_write_buffer(iommu);
+    else
+        iommu_flush_iotlb_dsi(iommu, domain_iommu_domid(domain), 0);
+
+    spin_unlock(&iommu->lock);
+    unmap_vtd_domain_page(context_entries);
+
+    return 0;
+}
+
+static int domain_context_unmap(struct domain *domain, u8 bus, u8 devfn)
+{
+    struct acpi_drhd_unit *drhd;
+    int ret = 0;
+    u32 type;
+    u8 secbus;
+
+    drhd = acpi_find_matched_drhd_unit(bus, devfn);
+    if ( !drhd )
+        return -ENODEV;
+
+    type = pdev_type(bus, devfn);
+    switch ( type )
+    {
+    case DEV_TYPE_PCIe_BRIDGE:
+    case DEV_TYPE_PCI_BRIDGE:
+        break;
+
+    case DEV_TYPE_PCIe_ENDPOINT:
+        gdprintk(XENLOG_INFO VTDPREFIX,
+                 "domain_context_unmap:PCIe: bdf = %x:%x.%x\n",
+                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+        ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
+        break;
+
+    case DEV_TYPE_PCI:
+        gdprintk(XENLOG_INFO VTDPREFIX,
+                 "domain_context_unmap:PCI: bdf = %x:%x.%x\n",
+                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+        ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
+        if ( ret )
+            break;
+
+        secbus = bus;
+        /* dependent devices unmapping */
+        while ( bus2bridge[bus].map )
+        {
+            secbus = bus;
+            devfn = bus2bridge[bus].devfn;
+            bus = bus2bridge[bus].bus;
+            ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
+            if ( ret )
+                return ret;
+        }
+
+        if ( bus != secbus )
+            ret = domain_context_unmap_one(domain, drhd->iommu, secbus, 0);
+        break;
+
+    default:
+        gdprintk(XENLOG_ERR VTDPREFIX,
+                 "domain_context_unmap:unknown type: bdf = %x:%x:%x\n",
+                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+        ret = -EINVAL;
+        break;
+    }
+
+    return ret;
+}
+
+static int reassign_device_ownership(
+    struct domain *source,
+    struct domain *target,
+    u8 bus, u8 devfn)
+{
+    struct hvm_iommu *source_hd = domain_hvm_iommu(source);
+    struct pci_dev *pdev;
+    struct acpi_drhd_unit *drhd;
+    struct iommu *pdev_iommu;
+    int ret, found = 0;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    pdev = pci_get_pdev_by_domain(source, bus, devfn);
+
+    if (!pdev)
+        return -ENODEV;
+
+    drhd = acpi_find_matched_drhd_unit(bus, devfn);
+    pdev_iommu = drhd->iommu;
+    domain_context_unmap(source, bus, devfn);
+
+    ret = domain_context_mapping(target, bus, devfn);
+    if ( ret )
+        return ret;
+
+    list_move(&pdev->domain_list, &target->arch.pdev_list);
+    pdev->domain = target;
+
+    for_each_pdev ( source, pdev )
+    {
+        drhd = acpi_find_matched_drhd_unit(pdev->bus, pdev->devfn);
+        if ( drhd->iommu == pdev_iommu )
+        {
+            found = 1;
+            break;
+        }
+    }
+
+    if ( !found )
+        clear_bit(pdev_iommu->index, &source_hd->iommu_bitmap);
+
+    return ret;
+}
+
+void iommu_domain_teardown(struct domain *d)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+
+    if ( list_empty(&acpi_drhd_units) )
+        return;
+
+    spin_lock(&hd->mapping_lock);
+    iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw));
+    hd->pgd_maddr = 0;
+    spin_unlock(&hd->mapping_lock);
+
+    iommu_domid_release(d);
+}
+
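+/*
+ * Install a gfn -> mfn mapping in the domain's VT-d page table and
+ * flush the IOTLB on every unit that has one of the domain's devices
+ * behind it.
+ */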
+int intel_iommu_map_page(
+    struct domain *d, unsigned long gfn, unsigned long mfn)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    struct dma_pte *page = NULL, *pte = NULL;
+    u64 pg_maddr;
+    int pte_present;
+
+    drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+    iommu = drhd->iommu;
+
+    /* Do nothing for dom0 if the IOMMU supports pass-through. */
+    if ( iommu_passthrough &&
+         ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) )
+        return 0;
+
+    spin_lock(&hd->mapping_lock);
+
+    pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1);
+    if ( pg_maddr == 0 )
+    {
+        spin_unlock(&hd->mapping_lock);
+        return -ENOMEM;
+    }
+    page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
+    pte = page + (gfn & LEVEL_MASK);
+    pte_present = dma_pte_present(*pte);
+    dma_set_pte_addr(*pte, (paddr_t)mfn << PAGE_SHIFT_4K);
+    dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE);
+    iommu_flush_cache_entry(pte);
+    spin_unlock(&hd->mapping_lock);
+    unmap_vtd_domain_page(page);
+
+    /*
+     * No need for pcidevs_lock here because we flush when devices are
+     * assigned or deassigned.
+     */
+    for_each_drhd_unit ( drhd )
+    {
+        iommu = drhd->iommu;
+
+        if ( !test_bit(iommu->index, &hd->iommu_bitmap) )
+            continue;
+
+        if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(d),
+                                   (paddr_t)gfn << PAGE_SHIFT_4K, 1,
+                                   !pte_present) )
+            iommu_flush_write_buffer(iommu);
+    }
+
+    return 0;
+}
+
+int intel_iommu_unmap_page(struct domain *d, unsigned long gfn)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+
+    drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+    iommu = drhd->iommu;
+
+    /* Do nothing for dom0 if the IOMMU supports pass-through. */
+    if ( iommu_passthrough &&
+         ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) )
+        return 0;
+
+    dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K);
+
+    return 0;
+}
+
+static int iommu_prepare_rmrr_dev(struct domain *d,
+                                  struct acpi_rmrr_unit *rmrr,
+                                  u8 bus, u8 devfn)
+{
+    int ret = 0;
+    u64 base, end;
+    unsigned long base_pfn, end_pfn;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    ASSERT(rmrr->base_address < rmrr->end_address);
+
+    base = rmrr->base_address & PAGE_MASK_4K;
+    base_pfn = base >> PAGE_SHIFT_4K;
+    end = PAGE_ALIGN_4K(rmrr->end_address);
+    end_pfn = end >> PAGE_SHIFT_4K;
+
+    while ( base_pfn < end_pfn )
+    {
+        intel_iommu_map_page(d, base_pfn, base_pfn);
+        base_pfn++;
+    }
+
+    ret = domain_context_mapping(d, bus, devfn);
+
+    return ret;
+}
+
+static int intel_iommu_add_device(struct pci_dev *pdev)
+{
+    struct acpi_rmrr_unit *rmrr;
+    u16 bdf;
+    int ret, i;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
+    if ( !pdev->domain )
+        return -EINVAL;
+
+    ret = domain_context_mapping(pdev->domain, pdev->bus, pdev->devfn);
+    if ( ret )
+    {
+        gdprintk(XENLOG_ERR VTDPREFIX,
+                 "intel_iommu_add_device: context mapping failed\n");
+        return ret;
+    }
+
+    for_each_rmrr_device ( rmrr, bdf, i )
+    {
+        if ( PCI_BUS(bdf) == pdev->bus && PCI_DEVFN2(bdf) == pdev->devfn )
+        {
+            ret = iommu_prepare_rmrr_dev(pdev->domain, rmrr,
+                                         pdev->bus, pdev->devfn);
+            if ( ret )
+                gdprintk(XENLOG_ERR VTDPREFIX,
+                         "intel_iommu_add_device: RMRR mapping failed\n");
+            break;
+        }
+    }
+
+    return ret;
+}
+
+static int intel_iommu_remove_device(struct pci_dev *pdev)
+{
+    struct acpi_rmrr_unit *rmrr;
+    u16 bdf;
+    int i;
+
+    if ( !pdev->domain )
+        return -EINVAL;
+
+    /* If the device belongs to dom0 and has an RMRR, don't remove it
+     * from dom0, because the BIOS may use the RMRR at boot time.
+     */
+    if ( pdev->domain->domain_id == 0 )
+    {
+        for_each_rmrr_device ( rmrr, bdf, i )
+        {
+            if ( PCI_BUS(bdf) == pdev->bus &&
+                 PCI_DEVFN2(bdf) == pdev->devfn )
+                return 0;
+        }
+    }
+
+    return domain_context_unmap(pdev->domain, pdev->bus, pdev->devfn);
+}
+
+static void setup_dom0_devices(struct domain *d)
+{
+    struct hvm_iommu *hd;
+    struct pci_dev *pdev;
+    int bus, dev, func;
+    u32 l;
+
+    hd = domain_hvm_iommu(d);
+
+    spin_lock(&pcidevs_lock);
+    for ( bus = 0; bus < 256; bus++ )
+    {
+        for ( dev = 0; dev < 32; dev++ )
+        {
+            for ( func = 0; func < 8; func++ )
+            {
+                l = pci_conf_read32(bus, dev, func, PCI_VENDOR_ID);
+                /* some broken boards return 0 or ~0 if a slot is empty: */
+                if ( (l == 0xffffffff) || (l == 0x00000000) ||
+                     (l == 0x0000ffff) || (l == 0xffff0000) )
+                    continue;
+
+                pdev = alloc_pdev(bus, PCI_DEVFN(dev, func));
+                pdev->domain = d;
+                list_add(&pdev->domain_list, &d->arch.pdev_list);
+                domain_context_mapping(d, pdev->bus, pdev->devfn);
+            }
+        }
+    }
+    spin_unlock(&pcidevs_lock);
+}
+
+void clear_fault_bits(struct iommu *iommu)
+{
+    u64 val;
+
+    val = dmar_readq(
+        iommu->reg,
+        cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+0x8);
+    dmar_writeq(
+        iommu->reg,
+        cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+8,
+        val);
+    dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS);
+}
+
+static int init_vtd_hw(void)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    struct iommu_flush *flush = NULL;
+    int vector;
+    int ret;
+
+    for_each_drhd_unit ( drhd )
+    {
+        iommu = drhd->iommu;
+        ret = iommu_set_root_entry(iommu);
+        if ( ret )
+        {
+            gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n");
+            return -EIO;
+        }
+
+        vector = iommu_set_interrupt(iommu);
+        dma_msi_data_init(iommu, vector);
+        dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map)));
+        iommu->vector = vector;
+        clear_fault_bits(iommu);
+        dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
+
+        /* initialize flush functions */
+        flush = iommu_get_flush(iommu);
+        flush->context = flush_context_reg;
+        flush->iotlb = flush_iotlb_reg;
+    }
+
+    return 0;
+}
+
+static void setup_dom0_rmrr(struct domain *d)
+{
+    struct acpi_rmrr_unit *rmrr;
+    u16 bdf;
+    int ret, i;
+
+    spin_lock(&pcidevs_lock);
+    for_each_rmrr_device ( rmrr, bdf, i )
+    {
+        ret = iommu_prepare_rmrr_dev(d, rmrr, PCI_BUS(bdf), PCI_DEVFN2(bdf));
+        if ( ret )
+            gdprintk(XENLOG_ERR VTDPREFIX,
+                     "IOMMU: mapping reserved region failed\n");
+    }
+    spin_unlock(&pcidevs_lock);
+}
+
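+/*
+ * One-time VT-d bring-up: allocate an iommu structure per DRHD unit,
+ * size the domain-id bitmap from the first unit's capabilities, and
+ * program the hardware.  On failure every unit is freed and VT-d is
+ * disabled.
+ */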
+int intel_vtd_setup(void)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+
+    if ( !vtd_enabled )
+        return -ENODEV;
+
+    spin_lock_init(&domid_bitmap_lock);
+    clflush_size = get_cache_line_size();
+
+    for_each_drhd_unit ( drhd )
+        if ( iommu_alloc(drhd) != 0 )
+            goto error;
+
+    /* Size the domain-id bitmap from the first DRHD unit's capabilities. */
+    drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+    iommu = drhd->iommu;
+
+    /* Allocate domain id bitmap, and set bit 0 as reserved */
+    domid_bitmap_size = cap_ndoms(iommu->cap);
+    domid_bitmap = xmalloc_array(unsigned long,
+                                 BITS_TO_LONGS(domid_bitmap_size));
+    if ( domid_bitmap == NULL )
+        goto error;
+    memset(domid_bitmap, 0, domid_bitmap_size / 8);
+    set_bit(0, domid_bitmap);
+
+    if ( init_vtd_hw() )
+        goto error;
+
+    return 0;
+
+ error:
+    for_each_drhd_unit ( drhd )
+        iommu_free(drhd);
+    vtd_enabled = 0;
+    return -ENOMEM;
+}
+
+/*
+ * If the device isn't owned by dom0, it has either already been
+ * assigned to another domain or it doesn't exist.
+ */
+int device_assigned(u8 bus, u8 devfn)
+{
+    struct pci_dev *pdev;
+
+    spin_lock(&pcidevs_lock);
+    pdev = pci_get_pdev_by_domain(dom0, bus, devfn);
+    if (!pdev)
+    {
+        spin_unlock(&pcidevs_lock);
+        return -1;
+    }
+
+    spin_unlock(&pcidevs_lock);
+    return 0;
+}
+
+int intel_iommu_assign_device(struct domain *d, u8 bus, u8 devfn)
+{
+    struct acpi_rmrr_unit *rmrr;
+    int ret = 0, i;
+    struct pci_dev *pdev;
+    u16 bdf;
+
+    if ( list_empty(&acpi_drhd_units) )
+        return -ENODEV;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    pdev = pci_get_pdev(bus, devfn);
+    if (!pdev)
+        return -ENODEV;
+
+    if ( pdev->domain != dom0 )
+    {
+        gdprintk(XENLOG_ERR VTDPREFIX,
+                 "IOMMU: cannot assign an already-assigned device\n");
+        return -EBUSY;
+    }
+
+    ret = reassign_device_ownership(dom0, d, bus, devfn);
+    if ( ret )
+        goto done;
+
+    /* Set up RMRR identity mappings */
+    for_each_rmrr_device( rmrr, bdf, i )
+    {
+        if ( PCI_BUS(bdf) == bus && PCI_DEVFN2(bdf) == devfn )
+        {
+            /* FIXME: Because the USB RMRR conflicts with the guest BIOS
+             * region, ignore USB RMRRs temporarily.
+             */
+            if ( is_usb_device(bus, devfn) )
+            {
+                ret = 0;
+                goto done;
+            }
+
+            ret = iommu_prepare_rmrr_dev(d, rmrr, bus, devfn);
+            if ( ret )
+                gdprintk(XENLOG_ERR VTDPREFIX,
+                         "IOMMU: mapping reserved region failed\n");
+            goto done;
+        }
+    }
+
+done:
+    return ret;
+}
+
+static int intel_iommu_group_id(u8 bus, u8 devfn)
+{
+    u8 secbus;
+    if ( !bus2bridge[bus].map || find_pcie_endpoint(&bus, &devfn, &secbus) )
+        return PCI_BDF2(bus, devfn);
+    else
+        return -1;
+}
+
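+/*
+ * Save each unit's fault-event MSI registers on suspend; restore them
+ * and re-enable translation on resume.
+ */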
+static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
+void iommu_suspend(void)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    u32    i;
+
+    if ( !vtd_enabled )
+        return;
+
+    iommu_flush_all();
+
+    for_each_drhd_unit ( drhd )
+    {
+        iommu = drhd->iommu;
+        i = iommu->index;
+
+        iommu_state[i][DMAR_FECTL_REG] =
+            (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
+        iommu_state[i][DMAR_FEDATA_REG] =
+            (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
+        iommu_state[i][DMAR_FEADDR_REG] =
+            (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
+        iommu_state[i][DMAR_FEUADDR_REG] =
+            (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
+    }
+}
+
+void iommu_resume(void)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu;
+    u32 i;
+
+    if ( !vtd_enabled )
+        return;
+
+    iommu_flush_all();
+
+    if ( init_vtd_hw() != 0 && force_iommu )
+        panic("IOMMU setup failed, crash Xen for security purposes!\n");
+
+    for_each_drhd_unit ( drhd )
+    {
+        iommu = drhd->iommu;
+        i = iommu->index;
+
+        dmar_writel(iommu->reg, DMAR_FECTL_REG,
+                    (u32) iommu_state[i][DMAR_FECTL_REG]);
+        dmar_writel(iommu->reg, DMAR_FEDATA_REG,
+                    (u32) iommu_state[i][DMAR_FEDATA_REG]);
+        dmar_writel(iommu->reg, DMAR_FEADDR_REG,
+                    (u32) iommu_state[i][DMAR_FEADDR_REG]);
+        dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
+                    (u32) iommu_state[i][DMAR_FEUADDR_REG]);
+        iommu_enable_translation(iommu);
+    }
+}
+
+struct iommu_ops intel_iommu_ops = {
+    .init = intel_iommu_domain_init,
+    .add_device = intel_iommu_add_device,
+    .remove_device = intel_iommu_remove_device,
+    .assign_device  = intel_iommu_assign_device,
+    .teardown = iommu_domain_teardown,
+    .map_page = intel_iommu_map_page,
+    .unmap_page = intel_iommu_unmap_page,
+    .reassign_device = reassign_device_ownership,
+    .get_device_group_id = intel_iommu_group_id,
+};
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/drivers/passthrough/vtd/iommu.h b/drivers/passthrough/vtd/iommu.h
new file mode 100644
index 0000000..73c1657
--- /dev/null
+++ b/drivers/passthrough/vtd/iommu.h
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Ashok Raj <ashok.raj@intel.com>
+ */
+
+#ifndef _INTEL_IOMMU_H_
+#define _INTEL_IOMMU_H_
+
+#include <xen/types.h>
+
+/*
+ * Intel IOMMU register specification per version 1.0 public spec.
+ */
+
+#define    DMAR_VER_REG    0x0    /* Arch version supported by this IOMMU */
+#define    DMAR_CAP_REG    0x8    /* Hardware supported capabilities */
+#define    DMAR_ECAP_REG    0x10    /* Extended capabilities supported */
+#define    DMAR_GCMD_REG    0x18    /* Global command register */
+#define    DMAR_GSTS_REG    0x1c    /* Global status register */
+#define    DMAR_RTADDR_REG    0x20    /* Root entry table */
+#define    DMAR_CCMD_REG    0x28    /* Context command reg */
+#define    DMAR_FSTS_REG    0x34    /* Fault Status register */
+#define    DMAR_FECTL_REG    0x38    /* Fault control register */
+#define    DMAR_FEDATA_REG    0x3c    /* Fault event interrupt data register */
+#define    DMAR_FEADDR_REG    0x40    /* Fault event interrupt addr register */
+#define    DMAR_FEUADDR_REG 0x44    /* Upper address register */
+#define    DMAR_AFLOG_REG    0x58    /* Advanced Fault control */
+#define    DMAR_PMEN_REG    0x64    /* Enable Protected Memory Region */
+#define    DMAR_PLMBASE_REG 0x68    /* PMRR Low addr */
+#define    DMAR_PLMLIMIT_REG 0x6c    /* PMRR low limit */
+#define    DMAR_PHMBASE_REG 0x70    /* pmrr high base addr */
+#define    DMAR_PHMLIMIT_REG 0x78    /* pmrr high limit */
+#define    DMAR_IQH_REG    0x80    /* invalidation queue head */
+#define    DMAR_IQT_REG    0x88    /* invalidation queue tail */
+#define    DMAR_IQA_REG    0x90    /* invalidation queue addr */
+#define    DMAR_IRTA_REG   0xB8    /* intr remap */
+
+#define OFFSET_STRIDE        (9)
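+/*
+ * MMIO register accessors.  64-bit reads and writes are composed from
+ * two 32-bit accesses so they also work on 32-bit builds.
+ */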
+#define dmar_readl(dmar, reg) readl(dmar + reg)
+#define dmar_writel(dmar, reg, val) writel(val, dmar + reg)
+#define dmar_readq(dmar, reg) ({ \
+        u32 lo, hi; \
+        lo = dmar_readl(dmar, reg); \
+        hi = dmar_readl(dmar, reg + 4); \
+        (((u64) hi) << 32) + lo; })
+#define dmar_writeq(dmar, reg, val) do {\
+        dmar_writel(dmar, reg, (u32)val); \
+        dmar_writel(dmar, reg + 4, (u32)((u64) val >> 32)); \
+    } while (0)
+
+#define VER_MAJOR(v)        (((v) & 0xf0) >> 4)
+#define VER_MINOR(v)        ((v) & 0x0f)
+
+/*
+ * Decoding Capability Register
+ */
+#define cap_read_drain(c)    (((c) >> 55) & 1)
+#define cap_write_drain(c)    (((c) >> 54) & 1)
+#define cap_max_amask_val(c)    (((c) >> 48) & 0x3f)
+#define cap_num_fault_regs(c)    ((((c) >> 40) & 0xff) + 1)
+#define cap_pgsel_inv(c)       (((c) >> 39) & 1)
+
+#define cap_super_page_val(c)    (((c) >> 34) & 0xf)
+#define cap_super_offset(c)    (((find_first_bit(&cap_super_page_val(c), 4)) \
+                    * OFFSET_STRIDE) + 21)
+
+#define cap_fault_reg_offset(c)    ((((c) >> 24) & 0x3ff) * 16)
+
+#define cap_isoch(c)        (((c) >> 23) & 1)
+#define cap_qos(c)        (((c) >> 22) & 1)
+#define cap_mgaw(c)        ((((c) >> 16) & 0x3f) + 1)
+#define cap_sagaw(c)        (((c) >> 8) & 0x1f)
+#define cap_caching_mode(c)    (((c) >> 7) & 1)
+#define cap_phmr(c)        (((c) >> 6) & 1)
+#define cap_plmr(c)        (((c) >> 5) & 1)
+#define cap_rwbf(c)        (((c) >> 4) & 1)
+#define cap_afl(c)        (((c) >> 3) & 1)
+#define cap_ndoms(c)        (1 << (4 + 2 * ((c) & 0x7)))
+
+/*
+ * Extended Capability Register
+ */
+
+#define ecap_niotlb_iunits(e)    ((((e) >> 24) & 0xff) + 1)
+#define ecap_iotlb_offset(e)     ((((e) >> 8) & 0x3ff) * 16)
+#define ecap_coherent(e)         ((e >> 0) & 0x1)
+#define ecap_queued_inval(e)     ((e >> 1) & 0x1)
+#define ecap_dev_iotlb(e)        ((e >> 2) & 0x1)
+#define ecap_intr_remap(e)       ((e >> 3) & 0x1)
+#define ecap_ext_intr(e)         ((e >> 4) & 0x1)
+#define ecap_cache_hints(e)      ((e >> 5) & 0x1)
+#define ecap_pass_thru(e)        ((e >> 6) & 0x1)
+
+/* IOTLB_REG */
+#define DMA_TLB_FLUSH_GRANU_OFFSET  60
+#define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60)
+#define DMA_TLB_DSI_FLUSH (((u64)2) << 60)
+#define DMA_TLB_PSI_FLUSH (((u64)3) << 60)
+#define DMA_TLB_IIRG(x) (((x) >> 60) & 7)
+#define DMA_TLB_IAIG(val) (((val) >> 57) & 7)
+#define DMA_TLB_DID(x) (((u64)(x & 0xffff)) << 32)
+
+#define DMA_TLB_READ_DRAIN (((u64)1) << 49)
+#define DMA_TLB_WRITE_DRAIN (((u64)1) << 48)
+#define DMA_TLB_IVT (((u64)1) << 63)
+
+#define DMA_TLB_IVA_ADDR(x) ((((u64)x) >> 12) << 12)
+#define DMA_TLB_IVA_HINT(x) ((((u64)x) & 1) << 6)
+
+/* GCMD_REG */
+#define DMA_GCMD_TE     (((u64)1) << 31)
+#define DMA_GCMD_SRTP   (((u64)1) << 30)
+#define DMA_GCMD_SFL    (((u64)1) << 29)
+#define DMA_GCMD_EAFL   (((u64)1) << 28)
+#define DMA_GCMD_WBF    (((u64)1) << 27)
+#define DMA_GCMD_QIE    (((u64)1) << 26)
+#define DMA_GCMD_IRE    (((u64)1) << 25)
+#define DMA_GCMD_SIRTP  (((u64)1) << 24)
+#define DMA_GCMD_CFI    (((u64)1) << 23)
+
+/* GSTS_REG */
+#define DMA_GSTS_TES    (((u64)1) << 31)
+#define DMA_GSTS_RTPS   (((u64)1) << 30)
+#define DMA_GSTS_FLS    (((u64)1) << 29)
+#define DMA_GSTS_AFLS   (((u64)1) << 28)
+#define DMA_GSTS_WBFS   (((u64)1) << 27)
+#define DMA_GSTS_QIES   (((u64)1) << 26)
+#define DMA_GSTS_IRES   (((u64)1) << 25)
+#define DMA_GSTS_SIRTPS (((u64)1) << 24)
+#define DMA_GSTS_CFIS   (((u64)1) << 23)
+
+/* PMEN_REG */
+#define DMA_PMEN_EPM    (((u32)1) << 31)
+#define DMA_PMEN_PRS    (((u32)1) << 0)
+
+/* CCMD_REG */
+#define DMA_CCMD_INVL_GRANU_OFFSET  61
+#define DMA_CCMD_ICC   (((u64)1) << 63)
+#define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61)
+#define DMA_CCMD_DOMAIN_INVL (((u64)2) << 61)
+#define DMA_CCMD_DEVICE_INVL (((u64)3) << 61)
+#define DMA_CCMD_FM(m) (((u64)((m) & 0x3)) << 32)
+#define DMA_CCMD_CIRG(x) ((((u64)3) << 61) & x)
+#define DMA_CCMD_MASK_NOBIT 0
+#define DMA_CCMD_MASK_1BIT 1
+#define DMA_CCMD_MASK_2BIT 2
+#define DMA_CCMD_MASK_3BIT 3
+#define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16)
+#define DMA_CCMD_DID(d) ((u64)((d) & 0xffff))
+
+#define DMA_CCMD_CAIG_MASK(x) (((u64)x) & ((u64) 0x3 << 59))
+
+/* FECTL_REG */
+#define DMA_FECTL_IM (((u64)1) << 31)
+
+/* FSTS_REG */
+#define DMA_FSTS_PFO ((u64)1 << 0)
+#define DMA_FSTS_PPF ((u64)1 << 1)
+#define DMA_FSTS_AFO ((u64)1 << 2)
+#define DMA_FSTS_APF ((u64)1 << 3)
+#define DMA_FSTS_IQE ((u64)1 << 4)
+#define DMA_FSTS_ICE ((u64)1 << 5)
+#define DMA_FSTS_ITE ((u64)1 << 6)
+#define DMA_FSTS_FAULTS (DMA_FSTS_PFO | DMA_FSTS_PPF | DMA_FSTS_AFO | \
+                         DMA_FSTS_APF | DMA_FSTS_IQE | DMA_FSTS_ICE | \
+                         DMA_FSTS_ITE)
+#define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)
+
+/* FRCD_REG, 32 bits access */
+#define DMA_FRCD_F (((u64)1) << 31)
+#define dma_frcd_type(d) ((d >> 30) & 1)
+#define dma_frcd_fault_reason(c) (c & 0xff)
+#define dma_frcd_source_id(c) (c & 0xffff)
+#define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bits */
+
+/*
+ * 0: Present
+ * 1-11: Reserved
+ * 12-63: Context Ptr (12 - (haw-1))
+ * 64-127: Reserved
+ */
+struct root_entry {
+    u64    val;
+    u64    rsvd1;
+};
+#define root_present(root)    ((root).val & 1)
+#define set_root_present(root) do {(root).val |= 1;} while(0)
+#define get_context_addr(root) ((root).val & PAGE_MASK_4K)
+#define set_root_value(root, value) \
+    do {(root).val |= ((value) & PAGE_MASK_4K);} while(0)
+
+struct context_entry {
+    u64 lo;
+    u64 hi;
+};
+#define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry))
+#define context_present(c) ((c).lo & 1)
+#define context_fault_disable(c) (((c).lo >> 1) & 1)
+#define context_translation_type(c) (((c).lo >> 2) & 3)
+#define context_address_root(c) ((c).lo & PAGE_MASK_4K)
+#define context_address_width(c) ((c).hi &  7)
+#define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1))
+
+#define context_set_present(c) do {(c).lo |= 1;} while(0)
+#define context_clear_present(c) do {(c).lo &= ~1;} while(0)
+#define context_set_fault_enable(c) \
+    do {(c).lo &= (((u64)-1) << 2) | 1;} while(0)
+
+#define context_set_translation_type(c, val) do { \
+        (c).lo &= (((u64)-1) << 4) | 3; \
+        (c).lo |= (val & 3) << 2; \
+    } while(0)
+#define CONTEXT_TT_MULTI_LEVEL 0
+#define CONTEXT_TT_DEV_IOTLB   1
+#define CONTEXT_TT_PASS_THRU   2
+
+#define context_set_address_root(c, val) \
+    do {(c).lo &= 0xfff; (c).lo |= (val) & PAGE_MASK_4K ;} while(0)
+#define context_set_address_width(c, val) \
+    do {(c).hi &= 0xfffffff8; (c).hi |= (val) & 7;} while(0)
+#define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while(0)
+
+/* page table handling */
+#define LEVEL_STRIDE       (9)
+#define LEVEL_MASK         ((1 << LEVEL_STRIDE) - 1)
+#define PTE_NUM            (1 << LEVEL_STRIDE)
+#define level_to_agaw(val) ((val) - 2)
+#define agaw_to_level(val) ((val) + 2)
+#define agaw_to_width(val) (30 + val * LEVEL_STRIDE)
+#define width_to_agaw(w)   ((w - 30)/LEVEL_STRIDE)
+#define level_to_offset_bits(l) (12 + (l - 1) * LEVEL_STRIDE)
+#define address_level_offset(addr, level) \
+            ((addr >> level_to_offset_bits(level)) & LEVEL_MASK)
+#define level_mask(l) (((u64)(-1)) << level_to_offset_bits(l))
+#define level_size(l) (1 << level_to_offset_bits(l))
+#define align_to_level(addr, l) ((addr + level_size(l) - 1) & level_mask(l))
+
+/*
+ * 0: readable
+ * 1: writable
+ * 2-6: reserved
+ * 7: super page
+ * 8-11: available
+ * 12-63: Host physical address
+ */
+struct dma_pte {
+    u64 val;
+};
+#define DMA_PTE_READ (1)
+#define DMA_PTE_WRITE (2)
+#define dma_clear_pte(p)    do {(p).val = 0;} while(0)
+#define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while(0)
+#define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while(0)
+#define dma_set_pte_superpage(p) do {(p).val |= (1 << 7);} while(0)
+#define dma_set_pte_prot(p, prot) \
+            do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0)
+#define dma_pte_addr(p) ((p).val & PAGE_MASK_4K)
+#define dma_set_pte_addr(p, addr) do {\
+            (p).val |= ((addr) & PAGE_MASK_4K); } while (0)
+#define dma_pte_present(p) (((p).val & 3) != 0)
+
+struct poll_info {
+    u64 saddr;
+    u32 udata;
+};
+
+#define VTD_PAGE_TABLE_LEVEL_3  3
+#define VTD_PAGE_TABLE_LEVEL_4  4
+
+#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
+#define MAX_IOMMU_REGS 0xc0
+
+extern struct list_head acpi_drhd_units;
+extern struct list_head acpi_rmrr_units;
+extern struct list_head acpi_ioapic_units;
+
+struct iommu_flush {
+    int (*context)(void *iommu, u16 did, u16 source_id,
+                   u8 function_mask, u64 type, int non_present_entry_flush);
+    int (*iotlb)(void *iommu, u16 did, u64 addr, unsigned int size_order,
+                 u64 type, int non_present_entry_flush);
+};
+
+struct intel_iommu {
+    struct iommu_flush flush;
+};
+
+#endif
diff --git a/drivers/passthrough/vtd/utils.c b/drivers/passthrough/vtd/utils.c
new file mode 100644
index 0000000..bf9fb6b
--- /dev/null
+++ b/drivers/passthrough/vtd/utils.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Allen Kay <allen.m.kay@intel.com>
+ */
+
+#include <xen/sched.h>
+#include <xen/delay.h>
+#include <xen/iommu.h>
+#include <xen/time.h>
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
+#include "iommu.h"
+#include "dmar.h"
+#include "vtd.h"
+#include "extern.h"
+
+int is_usb_device(u8 bus, u8 devfn)
+{
+    u16 class = pci_conf_read16(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+                                PCI_CLASS_DEVICE);
+    return (class == 0xc03);
+}
+
+/* Disable vt-d protected memory registers. */
+void disable_pmr(struct iommu *iommu)
+{
+    s_time_t start_time;
+    unsigned int val;
+
+    val = dmar_readl(iommu->reg, DMAR_PMEN_REG);
+    if ( !(val & DMA_PMEN_PRS) )
+        return;
+
+    dmar_writel(iommu->reg, DMAR_PMEN_REG, val & ~DMA_PMEN_EPM);
+    start_time = NOW();
+
+    for ( ; ; )
+    {
+        val = dmar_readl(iommu->reg, DMAR_PMEN_REG);
+        if ( (val & DMA_PMEN_PRS) == 0 )
+            break;
+
+        if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT )
+            panic("Disable PMRs timeout\n");
+
+        cpu_relax();
+    }
+
+    dprintk(XENLOG_INFO VTDPREFIX,
+            "Disabled protected memory registers\n");
+}
+
+void print_iommu_regs(struct acpi_drhd_unit *drhd)
+{
+    struct iommu *iommu = drhd->iommu;
+
+    printk("---- print_iommu_regs ----\n");
+    printk("print_iommu_regs: drhd->address = %"PRIx64"\n", drhd->address);
+    printk("print_iommu_regs: DMAR_VER_REG = %x\n",
+           dmar_readl(iommu->reg,DMAR_VER_REG));
+    printk("print_iommu_regs: DMAR_CAP_REG = %"PRIx64"\n",
+           dmar_readq(iommu->reg,DMAR_CAP_REG));
+    printk("print_iommu_regs: n_fault_reg = %"PRIx64"\n",
+           cap_num_fault_regs(dmar_readq(iommu->reg, DMAR_CAP_REG)));
+    printk("print_iommu_regs: fault_recording_offset_l = %"PRIx64"\n",
+           cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)));
+    printk("print_iommu_regs: fault_recording_offset_h = %"PRIx64"\n",
+           cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)) + 8);
+    printk("print_iommu_regs: fault_recording_reg_l = %"PRIx64"\n",
+           dmar_readq(iommu->reg,
+               cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG))));
+    printk("print_iommu_regs: fault_recording_reg_h = %"PRIx64"\n",
+           dmar_readq(iommu->reg,
+               cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)) + 8));
+    printk("print_iommu_regs: DMAR_ECAP_REG = %"PRIx64"\n",
+           dmar_readq(iommu->reg,DMAR_ECAP_REG));
+    printk("print_iommu_regs: DMAR_GCMD_REG = %x\n",
+           dmar_readl(iommu->reg,DMAR_GCMD_REG));
+    printk("print_iommu_regs: DMAR_GSTS_REG = %x\n",
+           dmar_readl(iommu->reg,DMAR_GSTS_REG));
+    printk("print_iommu_regs: DMAR_RTADDR_REG = %"PRIx64"\n",
+           dmar_readq(iommu->reg,DMAR_RTADDR_REG));
+    printk("print_iommu_regs: DMAR_CCMD_REG = %"PRIx64"\n",
+           dmar_readq(iommu->reg,DMAR_CCMD_REG));
+    printk("print_iommu_regs: DMAR_FSTS_REG = %x\n",
+           dmar_readl(iommu->reg,DMAR_FSTS_REG));
+    printk("print_iommu_regs: DMAR_FECTL_REG = %x\n",
+           dmar_readl(iommu->reg,DMAR_FECTL_REG));
+    printk("print_iommu_regs: DMAR_FEDATA_REG = %x\n",
+           dmar_readl(iommu->reg,DMAR_FEDATA_REG));
+    printk("print_iommu_regs: DMAR_FEADDR_REG = %x\n",
+           dmar_readl(iommu->reg,DMAR_FEADDR_REG));
+    printk("print_iommu_regs: DMAR_FEUADDR_REG = %x\n",
+           dmar_readl(iommu->reg,DMAR_FEUADDR_REG));
+}
+
+u32 get_level_index(unsigned long gmfn, int level)
+{
+    while ( --level )
+        gmfn = gmfn >> LEVEL_STRIDE;
+
+    return gmfn & LEVEL_MASK;
+}
+
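+/*
+ * Debug helper: walk the root entry, context entry and page-table
+ * levels for a given bus/devfn/gmfn and print what is found at each
+ * step.
+ */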
+void print_vtd_entries(struct iommu *iommu, int bus, int devfn, u64 gmfn)
+{
+    struct context_entry *ctxt_entry;
+    struct root_entry *root_entry;
+    struct dma_pte pte;
+    u64 *l;
+    u32 l_index, level;
+
+    printk("print_vtd_entries: iommu = %p bdf = %x:%x:%x gmfn = %"PRIx64"\n",
+           iommu, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), gmfn);
+
+    if ( iommu->root_maddr == 0 )
+    {
+        printk("    iommu->root_maddr = 0\n");
+        return;
+    }
+
+    root_entry = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
+ 
+    printk("    root_entry = %p\n", root_entry);
+    printk("    root_entry[%x] = %"PRIx64"\n", bus, root_entry[bus].val);
+    if ( !root_present(root_entry[bus]) )
+    {
+        unmap_vtd_domain_page(root_entry);
+        printk("    root_entry[%x] not present\n", bus);
+        return;
+    }
+
+    ctxt_entry =
+        (struct context_entry *)map_vtd_domain_page(root_entry[bus].val);
+    if ( ctxt_entry == NULL )
+    {
+        unmap_vtd_domain_page(root_entry);
+        printk("    ctxt_entry == NULL\n");
+        return;
+    }
+
+    printk("    context = %p\n", ctxt_entry);
+    printk("    context[%x] = %"PRIx64"_%"PRIx64"\n",
+           devfn, ctxt_entry[devfn].hi, ctxt_entry[devfn].lo);
+    if ( !context_present(ctxt_entry[devfn]) )
+    {
+        unmap_vtd_domain_page(ctxt_entry);
+        unmap_vtd_domain_page(root_entry);
+        printk("    ctxt_entry[%x] not present\n", devfn);
+        return;
+    }
+
+    level = agaw_to_level(context_address_width(ctxt_entry[devfn]));
+    if ( level != VTD_PAGE_TABLE_LEVEL_3 &&
+         level != VTD_PAGE_TABLE_LEVEL_4 )
+    {
+        unmap_vtd_domain_page(ctxt_entry);
+        unmap_vtd_domain_page(root_entry);
+        printk("Unsupported VTD page table level (%d)!\n", level);
+        return;
+    }
+
+    l = maddr_to_virt(ctxt_entry[devfn].lo);
+    do
+    {
+        l = (u64*)(((unsigned long)l >> PAGE_SHIFT_4K) << PAGE_SHIFT_4K);
+        printk("    l%d = %p\n", level, l);
+        if ( l == NULL )
+        {
+            unmap_vtd_domain_page(ctxt_entry);
+            unmap_vtd_domain_page(root_entry);
+            printk("    l%d == NULL\n", level);
+            break;
+        }
+        l_index = get_level_index(gmfn, level);
+        printk("    l%d_index = %x\n", level, l_index);
+        printk("    l%d[%x] = %"PRIx64"\n", level, l_index, l[l_index]);
+
+        pte.val = l[l_index];
+        if ( !dma_pte_present(pte) )
+        {
+            unmap_vtd_domain_page(ctxt_entry);
+            unmap_vtd_domain_page(root_entry);
+            printk("    l%d[%x] not present\n", level, l_index);
+            break;
+        }
+
+        l = maddr_to_virt(l[l_index]);
+    } while ( --level );
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/drivers/passthrough/vtd/vtd.h b/drivers/passthrough/vtd/vtd.h
new file mode 100644
index 0000000..ec02d12
--- /dev/null
+++ b/drivers/passthrough/vtd/vtd.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Allen Kay <allen.m.kay@intel.com>
+ * Copyright (C) Weidong Han <weidong.han@intel.com>
+ */
+
+#ifndef _VTD_H_
+#define _VTD_H_
+
+#include <xen/iommu.h>
+
+/* Accommodate both IOAPIC and IOSAPIC. */
+struct IO_xAPIC_route_entry {
+    __u32   vector      :  8,
+        delivery_mode   :  3,   /* 000: FIXED
+                                 * 001: lowest prio
+                                 * 111: ExtINT
+                                 */
+        dest_mode       :  1,   /* 0: physical, 1: logical */
+        delivery_status :  1,
+        polarity        :  1,
+        irr             :  1,
+        trigger         :  1,   /* 0: edge, 1: level */
+        mask            :  1,   /* 0: enabled, 1: disabled */
+        __reserved_2    : 15;
+   
+    union {
+        struct { __u32
+            __reserved_1    : 24,
+            physical_dest   :  4,
+            __reserved_2    :  4;
+        } physical;
+
+        struct { __u32
+            __reserved_1    : 24,
+            logical_dest    :  8;
+        } logical;
+
+#ifdef __ia64__
+        struct { __u32
+            __reserved_1    : 16,
+            dest_id         : 16;
+        };
+#endif
+    } dest;
+
+} __attribute__ ((packed));
+
+struct IO_APIC_route_remap_entry {
+    union {
+        u64 val;
+        struct {
+            u64 vector:8,
+            delivery_mode:3,
+            index_15:1,
+            delivery_status:1,
+            polarity:1,
+            irr:1,
+            trigger:1,
+            mask:1,
+            reserved:31,
+            format:1,
+            index_0_14:15;
+        };
+    };
+};
+
+struct msi_msg_remap_entry {
+    union {
+        u32 val;
+        struct {
+            u32 dontcare:2,
+                index_15:1,
+                SHV:1,
+                format:1,
+                index_0_14:15,
+                addr_id_val:12; /* Interrupt address identifier value,
+                                   must be 0FEEh */
+        };
+    } address_lo;   /* low 32 bits of msi message address */
+
+    u32 address_hi;     /* high 32 bits of msi message address */
+    u32 data;           /* msi message data */
+};
+
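+/* Arch-specific helpers, implemented for x86 in x86/vtd.c. */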
+unsigned int get_cache_line_size(void);
+void cacheline_flush(char *);
+void flush_all_cache(void);
+void *map_to_nocache_virt(int nr_iommus, u64 maddr);
+u64 alloc_pgtable_maddr(struct domain *d);
+void free_pgtable_maddr(u64 maddr);
+void *map_vtd_domain_page(u64 maddr);
+void unmap_vtd_domain_page(void *va);
+
+void iommu_flush_cache_entry(void *addr);
+void iommu_flush_cache_page(void *addr);
+
+#endif /* _VTD_H_ */
diff --git a/drivers/passthrough/vtd/x86/Makefile b/drivers/passthrough/vtd/x86/Makefile
new file mode 100644
index 0000000..85243e3
--- /dev/null
+++ b/drivers/passthrough/vtd/x86/Makefile
@@ -0,0 +1 @@
+obj-y += vtd.o
diff --git a/drivers/passthrough/vtd/x86/vtd.c b/drivers/passthrough/vtd/x86/vtd.c
new file mode 100644
index 0000000..352684a
--- /dev/null
+++ b/drivers/passthrough/vtd/x86/vtd.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2008, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Allen Kay <allen.m.kay@intel.com>
+ * Copyright (C) Weidong Han <weidong.han@intel.com>
+ */
+
+#include <xen/sched.h>
+#include <xen/domain_page.h>
+#include <asm/paging.h>
+#include <xen/iommu.h>
+#include <xen/numa.h>
+#include "../iommu.h"
+#include "../dmar.h"
+#include "../vtd.h"
+
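+/* Map/unmap a VT-d table page, given its machine address. */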
+void *map_vtd_domain_page(u64 maddr)
+{
+    return map_domain_page(maddr >> PAGE_SHIFT_4K);
+}
+
+void unmap_vtd_domain_page(void *va)
+{
+    unmap_domain_page(va);
+}
+
+/* Allocate a zeroed page for use as a VT-d page table; return its machine address */
+u64 alloc_pgtable_maddr(struct domain *d)
+{
+    struct page_info *pg;
+    u64 *vaddr;
+    unsigned long mfn;
+
+    pg = alloc_domheap_page(NULL);
+    if ( !pg )
+        return 0;
+    mfn = page_to_mfn(pg);
+    vaddr = map_domain_page(mfn);
+    memset(vaddr, 0, PAGE_SIZE);
+
+    iommu_flush_cache_page(vaddr);
+    unmap_domain_page(vaddr);
+
+    return (u64)mfn << PAGE_SHIFT_4K;
+}
+
+void free_pgtable_maddr(u64 maddr)
+{
+    if ( maddr != 0 )
+        free_domheap_page(maddr_to_page(maddr));
+}
+
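+/* CPUID leaf 1: EBX bits 15:8 give the CLFLUSH line size in 8-byte units. */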
+unsigned int get_cache_line_size(void)
+{
+    return ((cpuid_ebx(1) >> 8) & 0xff) * 8;
+}
+
+void cacheline_flush(char * addr)
+{
+    clflush(addr);
+}
+
+void flush_all_cache(void)
+{
+    wbinvd();
+}
+
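+/* Map an IOMMU's register page at a dedicated non-cacheable fixmap slot. */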
+void *map_to_nocache_virt(int nr_iommus, u64 maddr)
+{
+    set_fixmap_nocache(FIX_IOMMU_REGS_BASE_0 + nr_iommus, maddr);
+    return (void *)fix_to_virt(FIX_IOMMU_REGS_BASE_0 + nr_iommus);
+}
+
+struct hvm_irq_dpci *domain_get_irq_dpci(struct domain *domain)
+{
+    if ( !domain )
+        return NULL;
+
+    return domain->arch.hvm_domain.irq.dpci;
+}
+
+int domain_set_irq_dpci(struct domain *domain, struct hvm_irq_dpci *dpci)
+{
+    if ( !domain || !dpci )
+        return 0;
+
+    domain->arch.hvm_domain.irq.dpci = dpci;
+    return 1;
+}
+
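+/*
+ * Called when the guest EOIs an ISA IRQ: deassert every passed-through
+ * PCI INTx routed to that ISA IRQ and, once no assertions remain
+ * pending, stop the timeout timer and EOI the physical IRQ.
+ */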
+void hvm_dpci_isairq_eoi(struct domain *d, unsigned int isairq)
+{
+    struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
+    struct hvm_irq_dpci *dpci = NULL;
+    struct dev_intx_gsi_link *digl, *tmp;
+    int i;
+
+    ASSERT(isairq < NR_ISAIRQS);
+    if ( !vtd_enabled )
+        return;
+
+    spin_lock(&d->evtchn_lock);
+
+    dpci = domain_get_irq_dpci(d);
+
+    if ( !dpci || !test_bit(isairq, dpci->isairq_map) )
+    {
+        spin_unlock(&d->evtchn_lock);
+        return;
+    }
+    /* Multiple machine IRQs may be mapped to one ISA IRQ. */
+    for ( i = find_first_bit(dpci->mapping, NR_IRQS);
+          i < NR_IRQS;
+          i = find_next_bit(dpci->mapping, NR_IRQS, i + 1) )
+    {
+        list_for_each_entry_safe ( digl, tmp,
+            &dpci->mirq[i].digl_list, list )
+        {
+            if ( hvm_irq->pci_link.route[digl->link] == isairq )
+            {
+                hvm_pci_intx_deassert(d, digl->device, digl->intx);
+                if ( --dpci->mirq[i].pending == 0 )
+                {
+                    stop_timer(&dpci->hvm_timer[irq_to_vector(i)]);
+                    pirq_guest_eoi(d, i);
+                }
+            }
+        }
+    }
+    spin_unlock(&d->evtchn_lock);
+}
diff --git a/include/asm-x86/acpi.h b/include/asm-x86/acpi.h
index 227f763..8488da0 100644
--- a/include/asm-x86/acpi.h
+++ b/include/asm-x86/acpi.h
@@ -178,4 +178,6 @@ extern void acpi_reserve_bootmem(void);
 extern u8 x86_acpiid_to_apicid[];
 #define MAX_LOCAL_APIC 256
 
+extern int acpi_dmar_init(void);
+
 #endif /*_ASM_ACPI_H*/
diff --git a/include/asm-x86/domain.h b/include/asm-x86/domain.h
index fd28d4a..7a34df4 100644
--- a/include/asm-x86/domain.h
+++ b/include/asm-x86/domain.h
@@ -222,6 +222,9 @@ struct arch_domain
 
     struct hvm_domain hvm_domain;
 
+    /* Passed-through device list. */
+    struct list_head pdev_list;
+
     struct paging_domain paging;
     struct p2m_domain p2m ;
 
diff --git a/include/asm-x86/hvm/domain.h b/include/asm-x86/hvm/domain.h
index 50802b3..88cc1cd 100644
--- a/include/asm-x86/hvm/domain.h
+++ b/include/asm-x86/hvm/domain.h
@@ -27,6 +27,7 @@
 #include <asm/hvm/io.h>
 #include <public/hvm/params.h>
 #include <public/hvm/save.h>
+#include <xen/hvm/iommu.h>
 
 struct hvm_ioreq_page {
     spinlock_t lock;
@@ -62,6 +63,9 @@ struct hvm_domain {
 
     uint64_t               params[HVM_NR_PARAMS];
 
+    /* Pass-through */
+    struct hvm_iommu       hvm_iommu;
+
 #if CONFIG_PAGING_LEVELS == 3
     bool_t                 amd_npt_4gb_warning;
 #endif
diff --git a/include/asm-x86/hvm/irq.h b/include/asm-x86/hvm/irq.h
index 7ab4446..b6e78ba 100644
--- a/include/asm-x86/hvm/irq.h
+++ b/include/asm-x86/hvm/irq.h
@@ -28,6 +28,7 @@
 #include <asm/hvm/vpic.h>
 #include <asm/hvm/vioapic.h>
 #include <public/hvm/save.h>
+#include <xen/hvm/irq.h>
 
 
 struct hvm_irq {
@@ -90,6 +91,8 @@ struct hvm_irq {
 
     /* Last VCPU that was delivered a LowestPrio interrupt. */
     u8 round_robin_prev_vcpu;
+
+    struct hvm_irq_dpci *dpci;
 };
 
 #define hvm_pci_intx_gsi(dev, intx)  \
diff --git a/include/xen/hvm/irq.h b/include/xen/hvm/irq.h
new file mode 100644
index 0000000..c0ec1a0
--- /dev/null
+++ b/include/xen/hvm/irq.h
@@ -0,0 +1,99 @@
+/******************************************************************************
+ * irq.h
+ * 
+ * Interrupt distribution and delivery logic.
+ * 
+ * Copyright (c) 2006, K A Fraser, XenSource Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#ifndef __XEN_HVM_IRQ_H__
+#define __XEN_HVM_IRQ_H__
+
+#include <xen/types.h>
+#include <xen/spinlock.h>
+#include <asm/irq.h>
+#include <public/hvm/save.h>
+
+struct dev_intx_gsi_link {
+    struct list_head list;
+    uint8_t device;
+    uint8_t intx;
+    uint8_t gsi;
+    uint8_t link;
+};
+
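+/* Flag bit for hvm_mirq_dpci_mapping.flags: the machine IRQ is an MSI. */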
+#define _HVM_IRQ_DPCI_MSI  0x1
+
+struct hvm_gmsi_info {
+    uint32_t gvec;
+    uint32_t gflags;
+};
+
+struct hvm_mirq_dpci_mapping {
+    uint32_t flags;
+    int pending;
+    struct list_head digl_list;
+    struct domain *dom;
+    struct hvm_gmsi_info gmsi;
+};
+
+struct hvm_girq_dpci_mapping {
+    uint8_t valid;
+    uint8_t device;
+    uint8_t intx;
+    uint8_t machine_gsi;
+};
+
+#define NR_ISAIRQS  16
+#define NR_LINK     4
+
+/* Protected by domain's evtchn_lock */
+struct hvm_irq_dpci {
+    /* Machine IRQ to guest device/intx mapping. */
+    DECLARE_BITMAP(mapping, NR_IRQS);
+    struct hvm_mirq_dpci_mapping mirq[NR_IRQS];
+    /* Guest IRQ to guest device/intx mapping. */
+    struct hvm_girq_dpci_mapping girq[NR_IRQS];
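+    /* Guest MSI vector to machine IRQ mapping. */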
+    uint8_t msi_gvec_pirq[NR_VECTORS];
+    DECLARE_BITMAP(dirq_mask, NR_IRQS);
+    /* Record of mapped ISA IRQs */
+    DECLARE_BITMAP(isairq_map, NR_ISAIRQS);
+    /* Record of mapped Links */
+    uint8_t link_cnt[NR_LINK];
+    struct timer hvm_timer[NR_IRQS];
+};
+
+/* Modify state of a PCI INTx wire. */
+void hvm_pci_intx_assert(
+    struct domain *d, unsigned int device, unsigned int intx);
+void hvm_pci_intx_deassert(
+    struct domain *d, unsigned int device, unsigned int intx);
+
+/* Modify state of an ISA device's IRQ wire. */
+void hvm_isa_irq_assert(
+    struct domain *d, unsigned int isa_irq);
+void hvm_isa_irq_deassert(
+    struct domain *d, unsigned int isa_irq);
+
+void hvm_set_pci_link_route(struct domain *d, u8 link, u8 isa_irq);
+
+void hvm_maybe_deassert_evtchn_irq(void);
+void hvm_assert_evtchn_irq(struct vcpu *v);
+void hvm_set_callback_via(struct domain *d, uint64_t via);
+
+void hvm_dirq_assist(struct vcpu *v);
+
+#endif /* __XEN_HVM_IRQ_H__ */
diff --git a/include/xen/irq.h b/include/xen/irq.h
index f1c1ef4..c145ad9 100644
--- a/include/xen/irq.h
+++ b/include/xen/irq.h
@@ -63,6 +63,10 @@ extern irq_desc_t irq_desc[NR_IRQS];
 
 extern int setup_irq(unsigned int, struct irqaction *);
 extern void free_irq(unsigned int);
+extern int request_irq(unsigned int irq,
+                       void (*handler)(int, void *, struct cpu_user_regs *),
+                       unsigned long irqflags, const char * devname,
+                       void *dev_id);
 
 extern hw_irq_controller no_irq_type;
 extern void no_action(int cpl, void *dev_id, struct cpu_user_regs *regs);
diff --git a/include/xen/pci.h b/include/xen/pci.h
index fa01bd2..618c6bf 100644
--- a/include/xen/pci.h
+++ b/include/xen/pci.h
@@ -30,6 +30,7 @@
 #define PCI_BDF2(b,df)  ((((b) & 0xff) << 8) | ((df) & 0xff))
 
 struct pci_dev {
+    struct list_head alldevs_list;
     struct list_head domain_list;
     struct domain *domain;
     const u8 bus;
@@ -39,6 +40,25 @@ struct pci_dev {
 #define for_each_pdev(domain, pdev) \
     list_for_each_entry(pdev, &(domain->arch.pdev_list), domain_list)
 
+/*
+ * pcidevs_lock protects alldevs_list and device assignment; it also
+ * serializes access to the parts of the MSI capability that are not
+ * related to interrupt handling (the mask bit register).
+ */
+
+extern spinlock_t pcidevs_lock;
+
+struct pci_dev *alloc_pdev(u8 bus, u8 devfn);
+void free_pdev(struct pci_dev *pdev);
+struct pci_dev *pci_lock_pdev(int bus, int devfn);
+struct pci_dev *pci_lock_domain_pdev(struct domain *d, int bus, int devfn);
+
+void pci_release_devices(struct domain *d);
+int pci_add_device(u8 bus, u8 devfn);
+int pci_remove_device(u8 bus, u8 devfn);
+struct pci_dev *pci_get_pdev(int bus, int devfn);
+struct pci_dev *pci_get_pdev_by_domain(struct domain *d, int bus, int devfn);
+
 uint8_t pci_conf_read8(
     unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg);
 uint16_t pci_conf_read16(