From: ddugger@redhat.com <ddugger@redhat.com> Date: Mon, 23 Mar 2009 10:22:57 -0600 Subject: [xen] add VT-d specific files Message-id: 200903231622.n2NGMvxa022045@sobek.n0ano.com O-Subject: [RHEL5.4 PATCH 4/21 V2] add VT-d specific files Bugzilla: 484227 RH-Acked-by: Gerd Hoffmann <kraxel@redhat.com> RH-Acked-by: Chris Lalancette <clalance@redhat.com> Add the major VT-d code: implement VT-d under xen/drivers/passthrough/vtd/, perform VT-d initialization and teardown at boot/exit, and define the IRQ structures and functions VT-d requires. Upstream Status: Accepted (CS 15883, 15905, 17195, 17973, 18649) BZ: 484227 Signed-off-by: Weidong Han <weidong.han@intel.com> Signed-off-by: Gerd Hoffmann <kraxel@redhat.com> Signed-off-by: Don Dugger <donald.d.dugger@intel.com> diff --git a/arch/x86/acpi/boot.c b/arch/x86/acpi/boot.c index a4dddc9..f929ecc 100644 --- a/arch/x86/acpi/boot.c +++ b/arch/x86/acpi/boot.c @@ -919,6 +919,8 @@ int __init acpi_boot_init(void) acpi_table_parse(ACPI_HPET, acpi_parse_hpet); + acpi_dmar_init(); + return 0; } diff --git a/arch/x86/domain.c b/arch/x86/domain.c index 0651a71..bd1f214 100644 --- a/arch/x86/domain.c +++ b/arch/x86/domain.c @@ -44,6 +44,7 @@ #include <asm/hvm/hvm.h> #include <asm/hvm/support.h> #include <asm/msr.h> +#include <xen/iommu.h> #ifdef CONFIG_COMPAT #include <compat/vcpu.h> #endif @@ -416,6 +417,8 @@ int arch_domain_create(struct domain *d) int vcpuid, pdpt_order; int rc = -ENOMEM; + INIT_LIST_HEAD(&d->arch.pdev_list); + d->arch.relmem = RELMEM_not_started; INIT_LIST_HEAD(&d->arch.relmem_list); pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)); @@ -480,12 +483,18 @@ int arch_domain_create(struct domain *d) clear_page(d->shared_info); share_xen_page_with_guest( virt_to_page(d->shared_info), d, XENSHARE_writable); + + if ( (rc = iommu_domain_init(d)) != 0 ) + goto fail; } if ( is_hvm_domain(d) ) { if ( (rc = hvm_domain_initialise(d)) != 0 ) + { + iommu_domain_destroy(d); goto fail; + } } else { @@ -514,6 +523,10 @@ void arch_domain_destroy(struct domain *d) if ( is_hvm_domain(d) ) hvm_domain_destroy(d); + pci_release_devices(d); + if ( !is_idle_domain(d) ) + iommu_domain_destroy(d); + paging_final_teardown(d); free_xenheap_pages( diff --git a/arch/x86/irq.c b/arch/x86/irq.c index 4d17471..b9d761c 100644 --- a/arch/x86/irq.c +++ b/arch/x86/irq.c @@ -98,6 +98,39 @@ asmlinkage void do_IRQ(struct cpu_user_regs *regs) spin_unlock(&desc->lock); } +int request_irq(unsigned int irq, + void (*handler)(int, void *, struct cpu_user_regs *), + unsigned long irqflags, const char * devname, void *dev_id) +{ + struct irqaction * action; + int retval; + + /* + * Sanity-check: shared interrupts must pass in a real dev-ID, + * otherwise we'll have trouble later trying to figure out + * which interrupt is which (messes up the interrupt freeing + * logic etc).
+ */ + if (irq >= NR_IRQS) + return -EINVAL; + if (!handler) + return -EINVAL; + + action = xmalloc(struct irqaction); + if (!action) + return -ENOMEM; + + action->handler = handler; + action->name = devname; + action->dev_id = dev_id; + + retval = setup_irq(irq, action); + if (retval) + xfree(action); + + return retval; +} + void free_irq(unsigned int irq) { unsigned int vector = irq_to_vector(irq); diff --git a/arch/x86/setup.c b/arch/x86/setup.c index c621786..4ff4f6b 100644 --- a/arch/x86/setup.c +++ b/arch/x86/setup.c @@ -1122,6 +1122,14 @@ void arch_get_xen_caps(xen_capabilities_info_t *info) #endif } +int xen_in_range(paddr_t start, paddr_t end) +{ + start = max_t(paddr_t, start, xenheap_phys_start); + end = min_t(paddr_t, end, xenheap_phys_end); + + return start < end; +} + /* * Local variables: * mode: C diff --git a/drivers/Makefile b/drivers/Makefile index 1a41cee..fccfbb3 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -1,4 +1,6 @@ subdir-y += char subdir-y += pci +subdir-$(x86_32) += passthrough +subdir-$(x86_64) += passthrough subdir-$(HAS_ACPI) += acpi subdir-$(HAS_VGA) += video diff --git a/drivers/passthrough/Makefile b/drivers/passthrough/Makefile new file mode 100644 index 0000000..662f89c --- /dev/null +++ b/drivers/passthrough/Makefile @@ -0,0 +1,5 @@ +subdir-$(x86_32) += vtd +subdir-$(x86_64) += vtd + +obj-y += iommu.o +obj-y += pci.o diff --git a/drivers/passthrough/iommu.c b/drivers/passthrough/iommu.c new file mode 100644 index 0000000..ea345c7 --- /dev/null +++ b/drivers/passthrough/iommu.c @@ -0,0 +1,250 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + */ + +#include <xen/sched.h> +#include <xen/iommu.h> +#include <asm/hvm/iommu.h> +#include <xen/paging.h> +#include <xen/guest_access.h> + +static void parse_iommu_param(char *s); +int intel_vtd_setup(void); + +/* + * The 'iommu' parameter enables the IOMMU. 
An optional comma-separated + * value may contain: + * + * off|no|false|disable Disable IOMMU (default) + * force|required Don't boot unless IOMMU is enabled + * passthrough Bypass VT-d translation for Dom0 + */ +custom_param("iommu", parse_iommu_param); +int iommu_enabled = 0; +int force_iommu = 0; +int iommu_passthrough = 0; + +static void __init parse_iommu_param(char *s) +{ + char *ss; + iommu_enabled = 1; + + do { + ss = strchr(s, ','); + if ( ss ) + *ss = '\0'; + + if ( !strcmp(s, "off") || !strcmp(s, "no") || !strcmp(s, "false") || + !strcmp(s, "0") || !strcmp(s, "disable") ) + iommu_enabled = 0; + else if ( !strcmp(s, "force") || !strcmp(s, "required") ) + force_iommu = 1; + else if ( !strcmp(s, "passthrough") ) + iommu_passthrough = 1; + + s = ss + 1; + } while ( ss ); +} + +int iommu_domain_init(struct domain *domain) +{ + struct hvm_iommu *hd = domain_hvm_iommu(domain); + + spin_lock_init(&hd->mapping_lock); + INIT_LIST_HEAD(&hd->g2m_ioport_list); + + if ( !iommu_enabled ) + return 0; + + hd->platform_ops = iommu_get_ops(); + return hd->platform_ops->init(domain); +} + +int iommu_add_device(struct pci_dev *pdev) +{ + struct hvm_iommu *hd; + + if ( !pdev->domain ) + return -EINVAL; + + ASSERT(spin_is_locked(&pcidevs_lock)); + + hd = domain_hvm_iommu(pdev->domain); + if ( !iommu_enabled || !hd->platform_ops ) + return 0; + + return hd->platform_ops->add_device(pdev); +} + +int iommu_remove_device(struct pci_dev *pdev) +{ + struct hvm_iommu *hd; + if ( !pdev->domain ) + return -EINVAL; + + hd = domain_hvm_iommu(pdev->domain); + if ( !iommu_enabled || !hd->platform_ops ) + return 0; + + return hd->platform_ops->remove_device(pdev); +} + +int assign_device(struct domain *d, u8 bus, u8 devfn) +{ + struct hvm_iommu *hd = domain_hvm_iommu(d); + int rc = 0; + + if ( !iommu_enabled || !hd->platform_ops ) + return 0; + + spin_lock(&pcidevs_lock); + rc = hd->platform_ops->assign_device(d, bus, devfn); + spin_unlock(&pcidevs_lock); + return rc; +} + +void iommu_domain_destroy(struct domain *d) +{ + struct hvm_iommu *hd = domain_hvm_iommu(d); + struct list_head *ioport_list, *tmp; + struct g2m_ioport *ioport; + + if ( !iommu_enabled || !hd->platform_ops ) + return; + + if ( hd ) + { + list_for_each_safe ( ioport_list, tmp, &hd->g2m_ioport_list ) + { + ioport = list_entry(ioport_list, struct g2m_ioport, list); + list_del(&ioport->list); + xfree(ioport); + } + } + + hd->platform_ops->teardown(d); +} + +int iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn) +{ + struct hvm_iommu *hd = domain_hvm_iommu(d); + + if ( !iommu_enabled || !hd->platform_ops ) + return 0; + + return hd->platform_ops->map_page(d, gfn, mfn); +} + +int iommu_unmap_page(struct domain *d, unsigned long gfn) +{ + struct hvm_iommu *hd = domain_hvm_iommu(d); + + if ( !iommu_enabled || !hd->platform_ops ) + return 0; + + return hd->platform_ops->unmap_page(d, gfn); +} + +/* caller should hold the pcidevs_lock */ +int deassign_device(struct domain *d, u8 bus, u8 devfn) +{ + struct hvm_iommu *hd = domain_hvm_iommu(d); + struct pci_dev *pdev = NULL; + + if ( !iommu_enabled || !hd->platform_ops ) + return -EINVAL; + + ASSERT(spin_is_locked(&pcidevs_lock)); + pdev = pci_get_pdev(bus, devfn); + if (!pdev) + return -ENODEV; + + if (pdev->domain != d) + { + gdprintk(XENLOG_ERR VTDPREFIX, + "IOMMU: cannot deassign a device not owned by this domain\n"); + return -EINVAL; + } + + return hd->platform_ops->reassign_device(d, dom0, bus, devfn); +} + +static int iommu_setup(void) +{ + int rc = -ENODEV; + + if ( !iommu_enabled ) + goto out; +
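+ /* Probe and initialise the IOMMU hardware; iommu_enabled remains set only if this succeeds. */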
+ rc = iommu_hardware_setup(); + + iommu_enabled = (rc == 0); + + out: + if ( force_iommu && !iommu_enabled ) + panic("IOMMU setup failed, crash Xen for security purpose!\n"); + + printk("I/O virtualisation %sabled\n", iommu_enabled ? "en" : "dis"); + return rc; +} +__initcall(iommu_setup); + +int iommu_get_device_group(struct domain *d, u8 bus, u8 devfn, + XEN_GUEST_HANDLE_64(uint32_t) buf, int max_sdevs) +{ + struct hvm_iommu *hd = domain_hvm_iommu(d); + struct pci_dev *pdev; + int group_id, sdev_id; + u32 bdf; + int i = 0; + struct iommu_ops *ops = hd->platform_ops; + + if ( !iommu_enabled || !ops || !ops->get_device_group_id ) + return 0; + + group_id = ops->get_device_group_id(bus, devfn); + + spin_lock(&pcidevs_lock); + for_each_pdev( d, pdev ) + { + if ( (pdev->bus == bus) && (pdev->devfn == devfn) ) + continue; + + sdev_id = ops->get_device_group_id(pdev->bus, pdev->devfn); + if ( (sdev_id == group_id) && (i < max_sdevs) ) + { + bdf = 0; + bdf |= (pdev->bus & 0xff) << 16; + bdf |= (pdev->devfn & 0xff) << 8; + if ( unlikely(copy_to_guest_offset(buf, i, &bdf, 1)) ) + { + spin_unlock(&pcidevs_lock); + return -1; + } + i++; + } + } + spin_unlock(&pcidevs_lock); + + return i; +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/drivers/passthrough/pci.c b/drivers/passthrough/pci.c new file mode 100644 index 0000000..c9a744c --- /dev/null +++ b/drivers/passthrough/pci.c @@ -0,0 +1,202 @@ +/* + * Copyright (C) 2008, Netronome Systems, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ */ + +#include <xen/sched.h> +#include <xen/pci.h> +#include <xen/pci_regs.h> +#include <xen/list.h> +#include <xen/prefetch.h> +#include <xen/iommu.h> +#include <asm/hvm/iommu.h> +#include <asm/hvm/irq.h> +#include <xen/delay.h> +#include <xen/keyhandler.h> + + +LIST_HEAD(alldevs_list); +spinlock_t pcidevs_lock = SPIN_LOCK_UNLOCKED; + +struct pci_dev *alloc_pdev(u8 bus, u8 devfn) +{ + struct pci_dev *pdev; + + list_for_each_entry ( pdev, &alldevs_list, alldevs_list ) + if ( pdev->bus == bus && pdev->devfn == devfn ) + return pdev; + + pdev = xmalloc(struct pci_dev); + if ( !pdev ) + return NULL; + memset(pdev, 0, sizeof(struct pci_dev)); + + *((u8*) &pdev->bus) = bus; + *((u8*) &pdev->devfn) = devfn; + pdev->domain = NULL; + list_add(&pdev->alldevs_list, &alldevs_list); + + return pdev; +} + +void free_pdev(struct pci_dev *pdev) +{ + list_del(&pdev->alldevs_list); + xfree(pdev); +} + +struct pci_dev *pci_get_pdev(int bus, int devfn) +{ + struct pci_dev *pdev = NULL; + + ASSERT(spin_is_locked(&pcidevs_lock)); + + list_for_each_entry ( pdev, &alldevs_list, alldevs_list ) + if ( (pdev->bus == bus || bus == -1) && + (pdev->devfn == devfn || devfn == -1) ) + { + return pdev; + } + + return NULL; +} + +struct pci_dev *pci_get_pdev_by_domain(struct domain *d, int bus, int devfn) +{ + struct pci_dev *pdev = NULL; + + ASSERT(spin_is_locked(&pcidevs_lock)); + + list_for_each_entry ( pdev, &alldevs_list, alldevs_list ) + if ( (pdev->bus == bus || bus == -1) && + (pdev->devfn == devfn || devfn == -1) && + (pdev->domain == d) ) + { + return pdev; + } + + return NULL; +} + +int pci_add_device(u8 bus, u8 devfn) +{ + struct pci_dev *pdev; + int ret = -ENOMEM; + + spin_lock(&pcidevs_lock); + pdev = alloc_pdev(bus, devfn); + if ( !pdev ) + goto out; + + ret = 0; + if ( !pdev->domain ) + { + pdev->domain = dom0; + ret = iommu_add_device(pdev); + if ( ret ) + goto out; + + list_add(&pdev->domain_list, &dom0->arch.pdev_list); + } + +out: + spin_unlock(&pcidevs_lock); + printk(XENLOG_DEBUG "PCI add device %02x:%02x.%x\n", bus, + PCI_SLOT(devfn), PCI_FUNC(devfn)); + return ret; +} + +int pci_remove_device(u8 bus, u8 devfn) +{ + struct pci_dev *pdev; + int ret = -ENODEV; + + spin_lock(&pcidevs_lock); + list_for_each_entry ( pdev, &alldevs_list, alldevs_list ) + if ( pdev->bus == bus && pdev->devfn == devfn ) + { + ret = iommu_remove_device(pdev); + if ( pdev->domain ) + list_del(&pdev->domain_list); + free_pdev(pdev); + printk(XENLOG_DEBUG "PCI remove device %02x:%02x.%x\n", bus, + PCI_SLOT(devfn), PCI_FUNC(devfn)); + break; + } + + spin_unlock(&pcidevs_lock); + return ret; +} + +static void pci_clean_dpci_irqs(struct domain *d) +{ + struct hvm_irq_dpci *hvm_irq_dpci = NULL; + uint32_t i; + struct list_head *digl_list, *tmp; + struct dev_intx_gsi_link *digl; + + if ( !iommu_enabled ) + return; + + spin_lock(&d->evtchn_lock); + hvm_irq_dpci = domain_get_irq_dpci(d); + if ( hvm_irq_dpci != NULL ) + { + for ( i = find_first_bit(hvm_irq_dpci->mapping, NR_IRQS); + i < NR_IRQS; + i = find_next_bit(hvm_irq_dpci->mapping, NR_IRQS, i + 1) ) + { + pirq_guest_unbind(d, i); + kill_timer(&hvm_irq_dpci->hvm_timer[irq_to_vector(i)]); + + list_for_each_safe ( digl_list, tmp, + &hvm_irq_dpci->mirq[i].digl_list ) + { + digl = list_entry(digl_list, + struct dev_intx_gsi_link, list); + list_del(&digl->list); + xfree(digl); + } + } + + d->arch.hvm_domain.irq.dpci = NULL; + xfree(hvm_irq_dpci); + } + spin_unlock(&d->evtchn_lock); +} + +void pci_release_devices(struct domain *d) +{ + struct pci_dev *pdev; + u8 bus, devfn; + +
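/* Tear down dpci IRQ state first, then hand each device owned by d back to dom0 (deassign_device reassigns to dom0). */ +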
spin_lock(&pcidevs_lock); + pci_clean_dpci_irqs(d); + while ( (pdev = pci_get_pdev_by_domain(d, -1, -1)) ) + { + bus = pdev->bus; devfn = pdev->devfn; + deassign_device(d, bus, devfn); + } + spin_unlock(&pcidevs_lock); +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/drivers/passthrough/vtd/Makefile b/drivers/passthrough/vtd/Makefile new file mode 100644 index 0000000..61bc48b --- /dev/null +++ b/drivers/passthrough/vtd/Makefile @@ -0,0 +1,6 @@ +subdir-$(x86_32) += x86 +subdir-$(x86_64) += x86 + +obj-y += iommu.o +obj-y += dmar.o +obj-y += utils.o diff --git a/drivers/passthrough/vtd/dmar.c b/drivers/passthrough/vtd/dmar.c new file mode 100644 index 0000000..740a7e7 --- /dev/null +++ b/drivers/passthrough/vtd/dmar.c @@ -0,0 +1,514 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) Ashok Raj <ashok.raj@intel.com> + * Copyright (C) Shaohua Li <shaohua.li@intel.com> + * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen + */ + +#include <xen/init.h> +#include <xen/bitmap.h> +#include <xen/kernel.h> +#include <xen/acpi.h> +#include <xen/mm.h> +#include <xen/xmalloc.h> +#include <xen/pci.h> +#include <xen/pci_regs.h> +#include <asm/string.h> +#include "dmar.h" + +int vtd_enabled = 1; + +#undef PREFIX +#define PREFIX VTDPREFIX "ACPI DMAR:" +#define DEBUG + +#define MIN_SCOPE_LEN (sizeof(struct acpi_pci_path) + \ + sizeof(struct acpi_dev_scope)) + +LIST_HEAD(acpi_drhd_units); +LIST_HEAD(acpi_rmrr_units); +LIST_HEAD(acpi_atsr_units); + +u8 dmar_host_address_width; + +void dmar_scope_add_buses(struct dmar_scope *scope, u16 sec_bus, u16 sub_bus) +{ + sub_bus &= 0xff; + if (sec_bus > sub_bus) + return; + + while ( sec_bus <= sub_bus ) + set_bit(sec_bus++, scope->buses); +} + +void dmar_scope_remove_buses(struct dmar_scope *scope, u16 sec_bus, u16 sub_bus) +{ + sub_bus &= 0xff; + if (sec_bus > sub_bus) + return; + + while ( sec_bus <= sub_bus ) + clear_bit(sec_bus++, scope->buses); +} + +static int __init acpi_register_drhd_unit(struct acpi_drhd_unit *drhd) +{ + /* + * add INCLUDE_ALL at the tail, so scan the list will find it at + * the very end. 
+ */ + if ( drhd->include_all ) + list_add_tail(&drhd->list, &acpi_drhd_units); + else + list_add(&drhd->list, &acpi_drhd_units); + return 0; +} + +static int __init acpi_register_rmrr_unit(struct acpi_rmrr_unit *rmrr) +{ + list_add(&rmrr->list, &acpi_rmrr_units); + return 0; +} + +static void __init disable_all_dmar_units(void) +{ + struct acpi_drhd_unit *drhd, *_drhd; + struct acpi_rmrr_unit *rmrr, *_rmrr; + struct acpi_atsr_unit *atsr, *_atsr; + + list_for_each_entry_safe ( drhd, _drhd, &acpi_drhd_units, list ) + { + list_del(&drhd->list); + xfree(drhd); + } + list_for_each_entry_safe ( rmrr, _rmrr, &acpi_rmrr_units, list ) + { + list_del(&rmrr->list); + xfree(rmrr); + } + list_for_each_entry_safe ( atsr, _atsr, &acpi_atsr_units, list ) + { + list_del(&atsr->list); + xfree(atsr); + } +} + +static int __init acpi_register_atsr_unit(struct acpi_atsr_unit *atsr) +{ + /* + * add ALL_PORTS at the tail, so scan the list will find it at + * the very end. + */ + if ( atsr->all_ports ) + list_add_tail(&atsr->list, &acpi_atsr_units); + else + list_add(&atsr->list, &acpi_atsr_units); + return 0; +} + +struct acpi_drhd_unit * acpi_find_matched_drhd_unit(u8 bus, u8 devfn) +{ + struct acpi_drhd_unit *drhd; + struct acpi_drhd_unit *found = NULL, *include_all = NULL; + int i; + + list_for_each_entry ( drhd, &acpi_drhd_units, list ) + { + for (i = 0; i < drhd->scope.devices_cnt; i++) + if ( drhd->scope.devices[i] == PCI_BDF2(bus, devfn) ) + return drhd; + + if ( test_bit(bus, drhd->scope.buses) ) + found = drhd; + + if ( drhd->include_all ) + include_all = drhd; + } + + return found ? found : include_all; +} + +struct acpi_atsr_unit * acpi_find_matched_atsr_unit(u8 bus, u8 devfn) +{ + struct acpi_atsr_unit *atsr; + struct acpi_atsr_unit *found = NULL, *include_all = NULL; + int i; + + list_for_each_entry ( atsr, &acpi_atsr_units, list ) + { + for (i = 0; i < atsr->scope.devices_cnt; i++) + if ( atsr->scope.devices[i] == PCI_BDF2(bus, devfn) ) + return atsr; + + if ( test_bit(bus, atsr->scope.buses) ) + found = atsr; + + if ( atsr->all_ports ) + include_all = atsr; + } + + return found ? found : include_all; +} + +/* + * Count number of devices in device scope. Do not include PCI sub + * hierarchies. 
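+ * Returns the device count, or -EINVAL on a malformed scope entry.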
+ */ +static int scope_device_count(void *start, void *end) +{ + struct acpi_dev_scope *scope; + int count = 0; + + while ( start < end ) + { + scope = start; + if ( (scope->length < MIN_SCOPE_LEN) || + (scope->dev_type >= ACPI_DEV_ENTRY_COUNT) ) + { + dprintk(XENLOG_WARNING VTDPREFIX, "Invalid device scope.\n"); + return -EINVAL; + } + + if ( scope->dev_type == ACPI_DEV_ENDPOINT || + scope->dev_type == ACPI_DEV_IOAPIC || + scope->dev_type == ACPI_DEV_MSI_HPET ) + count++; + + start += scope->length; + } + + return count; +} + + +static int __init acpi_parse_dev_scope(void *start, void *end, + void *acpi_entry, int type) +{ + struct dmar_scope *scope = acpi_entry; + struct acpi_ioapic_unit *acpi_ioapic_unit; + struct acpi_dev_scope *acpi_scope; + u16 bus, sub_bus, sec_bus; + struct acpi_pci_path *path; + int depth, cnt, didx = 0; + + if ( (cnt = scope_device_count(start, end)) < 0 ) + return cnt; + + scope->devices_cnt = cnt; + if ( cnt > 0 ) + { + scope->devices = xmalloc_array(u16, cnt); + if ( !scope->devices ) + return -ENOMEM; + memset(scope->devices, 0, sizeof(u16) * cnt); + } + + while ( start < end ) + { + acpi_scope = start; + path = (struct acpi_pci_path *)(acpi_scope + 1); + depth = (acpi_scope->length - sizeof(struct acpi_dev_scope)) + / sizeof(struct acpi_pci_path); + bus = acpi_scope->start_bus; + + while ( --depth > 0 ) + { + bus = pci_conf_read8(bus, path->dev, path->fn, PCI_SECONDARY_BUS); + path++; + } + + switch ( acpi_scope->dev_type ) + { + case ACPI_DEV_P2PBRIDGE: + sec_bus = pci_conf_read8( + bus, path->dev, path->fn, PCI_SECONDARY_BUS); + sub_bus = pci_conf_read8( + bus, path->dev, path->fn, PCI_SUBORDINATE_BUS); + dprintk(XENLOG_INFO VTDPREFIX, + "found bridge: bdf = %x:%x.%x sec = %x sub = %x\n", + bus, path->dev, path->fn, sec_bus, sub_bus); + + dmar_scope_add_buses(scope, sec_bus, sub_bus); + break; + + case ACPI_DEV_MSI_HPET: + dprintk(XENLOG_INFO VTDPREFIX, "found MSI HPET: bdf = %x:%x.%x\n", + bus, path->dev, path->fn); + scope->devices[didx++] = PCI_BDF(bus, path->dev, path->fn); + break; + + case ACPI_DEV_ENDPOINT: + dprintk(XENLOG_INFO VTDPREFIX, "found endpoint: bdf = %x:%x.%x\n", + bus, path->dev, path->fn); + scope->devices[didx++] = PCI_BDF(bus, path->dev, path->fn); + break; + + case ACPI_DEV_IOAPIC: + dprintk(XENLOG_INFO VTDPREFIX, "found IOAPIC: bdf = %x:%x.%x\n", + bus, path->dev, path->fn); + + if ( type == DMAR_TYPE ) + { + struct acpi_drhd_unit *drhd = acpi_entry; + acpi_ioapic_unit = xmalloc(struct acpi_ioapic_unit); + if ( !acpi_ioapic_unit ) + return -ENOMEM; + acpi_ioapic_unit->apic_id = acpi_scope->enum_id; + acpi_ioapic_unit->ioapic.bdf.bus = bus; + acpi_ioapic_unit->ioapic.bdf.dev = path->dev; + acpi_ioapic_unit->ioapic.bdf.func = path->fn; + list_add(&acpi_ioapic_unit->list, &drhd->ioapic_list); + } + + scope->devices[didx++] = PCI_BDF(bus, path->dev, path->fn); + break; + } + + start += acpi_scope->length; + } + + return 0; +} + +static int __init +acpi_parse_one_drhd(struct acpi_dmar_entry_header *header) +{ + struct acpi_table_drhd * drhd = (struct acpi_table_drhd *)header; + void *dev_scope_start, *dev_scope_end; + struct acpi_drhd_unit *dmaru; + int ret = 0; + static int include_all = 0; + + dmaru = xmalloc(struct acpi_drhd_unit); + if ( !dmaru ) + return -ENOMEM; + memset(dmaru, 0, sizeof(struct acpi_drhd_unit)); + + dmaru->address = drhd->address; + dmaru->include_all = drhd->flags & 1; /* BIT0: INCLUDE_ALL */ + INIT_LIST_HEAD(&dmaru->ioapic_list); + dprintk(XENLOG_INFO VTDPREFIX, "dmaru->address = %"PRIx64"\n", + dmaru->address); 
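+ /* Variable-length device scope entries follow the fixed DRHD header. */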
+ + dev_scope_start = (void *)(drhd + 1); + dev_scope_end = ((void *)drhd) + header->length; + ret = acpi_parse_dev_scope(dev_scope_start, dev_scope_end, + dmaru, DMAR_TYPE); + + if ( dmaru->include_all ) + { + dprintk(XENLOG_INFO VTDPREFIX, "found INCLUDE_ALL\n"); + /* Only allow one INCLUDE_ALL */ + if ( include_all ) + { + dprintk(XENLOG_WARNING VTDPREFIX, + "Only one INCLUDE_ALL device scope is allowed\n"); + ret = -EINVAL; + } + include_all = 1; + } + + if ( ret ) + xfree(dmaru); + else + acpi_register_drhd_unit(dmaru); + return ret; +} + +static int __init +acpi_parse_one_rmrr(struct acpi_dmar_entry_header *header) +{ + struct acpi_table_rmrr *rmrr = (struct acpi_table_rmrr *)header; + struct acpi_rmrr_unit *rmrru; + void *dev_scope_start, *dev_scope_end; + int ret = 0; + + if ( rmrr->base_address >= rmrr->end_address ) + { + dprintk(XENLOG_ERR VTDPREFIX, + "RMRR error: base_addr %"PRIx64" end_address %"PRIx64"\n", + rmrr->base_address, rmrr->end_address); + return -EFAULT; + } + + rmrru = xmalloc(struct acpi_rmrr_unit); + if ( !rmrru ) + return -ENOMEM; + memset(rmrru, 0, sizeof(struct acpi_rmrr_unit)); + + rmrru->base_address = rmrr->base_address; + rmrru->end_address = rmrr->end_address; + dev_scope_start = (void *)(rmrr + 1); + dev_scope_end = ((void *)rmrr) + header->length; + ret = acpi_parse_dev_scope(dev_scope_start, dev_scope_end, + rmrru, RMRR_TYPE); + + if ( ret || (rmrru->scope.devices_cnt == 0) ) + xfree(rmrru); + else + acpi_register_rmrr_unit(rmrru); + return ret; +} + +static int __init +acpi_parse_one_atsr(struct acpi_dmar_entry_header *header) +{ + struct acpi_table_atsr *atsr = (struct acpi_table_atsr *)header; + struct acpi_atsr_unit *atsru; + int ret = 0; + static int all_ports; + void *dev_scope_start, *dev_scope_end; + + atsru = xmalloc(struct acpi_atsr_unit); + if ( !atsru ) + return -ENOMEM; + memset(atsru, 0, sizeof(struct acpi_atsr_unit)); + + atsru->all_ports = atsr->flags & 1; /* BIT0: ALL_PORTS */ + if ( !atsru->all_ports ) + { + dev_scope_start = (void *)(atsr + 1); + dev_scope_end = ((void *)atsr) + header->length; + ret = acpi_parse_dev_scope(dev_scope_start, dev_scope_end, + atsru, ATSR_TYPE); + } + else + { + dprintk(XENLOG_INFO VTDPREFIX, "found ALL_PORTS\n"); + /* Only allow one ALL_PORTS */ + if ( all_ports ) + { + dprintk(XENLOG_WARNING VTDPREFIX, + "Only one ALL_PORTS device scope is allowed\n"); + ret = -EINVAL; + } + all_ports = 1; + } + + if ( ret ) + xfree(atsru); + else + acpi_register_atsr_unit(atsru); + return ret; +} + +static int __init acpi_parse_dmar(unsigned long phys_addr, + unsigned long size) +{ + struct acpi_table_dmar *dmar; + struct acpi_dmar_entry_header *entry_header; + int ret = 0; + + if ( !phys_addr || !size ) + return -EINVAL; + + dmar = (struct acpi_table_dmar *)__acpi_map_table(phys_addr, size); + if ( !dmar ) + { + dprintk(XENLOG_WARNING VTDPREFIX, "Unable to map DMAR\n"); + return -ENODEV; + } + + if ( !dmar->haw ) + { + dprintk(XENLOG_WARNING VTDPREFIX, "Zero: Invalid DMAR width\n"); + if ( force_iommu ) + panic("acpi_parse_dmar: Invalid DMAR width," + " crash Xen for security purpose!\n"); + return -EINVAL; + } + + dmar_host_address_width = dmar->haw + 1; + dprintk(XENLOG_INFO VTDPREFIX, "Host address width %d\n", + dmar_host_address_width); + + entry_header = (struct acpi_dmar_entry_header *)(dmar + 1); + while ( ((unsigned long)entry_header) < + (((unsigned long)dmar) + size) ) + { + switch ( entry_header->type ) + { + case ACPI_DMAR_DRHD: + dprintk(XENLOG_INFO VTDPREFIX, "found ACPI_DMAR_DRHD\n"); + ret =
acpi_parse_one_drhd(entry_header); + break; + case ACPI_DMAR_RMRR: + dprintk(XENLOG_INFO VTDPREFIX, "found ACPI_DMAR_RMRR\n"); + ret = acpi_parse_one_rmrr(entry_header); + break; + case ACPI_DMAR_ATSR: + dprintk(XENLOG_INFO VTDPREFIX, "found ACPI_DMAR_ATSR\n"); + ret = acpi_parse_one_atsr(entry_header); + break; + default: + dprintk(XENLOG_WARNING VTDPREFIX, "Unknown DMAR structure type\n"); + ret = -EINVAL; + break; + } + if ( ret ) + break; + + entry_header = ((void *)entry_header + entry_header->length); + } + + /* Zap ACPI DMAR signature to prevent dom0 from using VT-d HW. */ + dmar->header.signature[0] = '\0'; + + if ( ret ) + { + if ( force_iommu ) + panic("acpi_parse_dmar: Failed to parse ACPI DMAR," + " crash Xen for security purpose!\n"); + else + { + printk(XENLOG_WARNING + "Failed to parse ACPI DMAR. Disabling VT-d.\n"); + disable_all_dmar_units(); + } + } + + return ret; +} + +int acpi_dmar_init(void) +{ + int rc; + + rc = -ENODEV; + if ( force_iommu ) + iommu_enabled = 1; + + if ( !iommu_enabled ) + goto fail; + + acpi_table_parse(ACPI_DMAR, acpi_parse_dmar); + + if ( list_empty(&acpi_drhd_units) ) + goto fail; + + printk("Intel VT-d has been enabled\n"); + + return 0; + + fail: + if ( force_iommu ) + panic("acpi_dmar_init: acpi_dmar_init failed," + " crash Xen for security purpose!\n"); + + vtd_enabled = 0; + return rc; +} diff --git a/drivers/passthrough/vtd/dmar.h b/drivers/passthrough/vtd/dmar.h new file mode 100644 index 0000000..c2132e2 --- /dev/null +++ b/drivers/passthrough/vtd/dmar.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * + * Copyright (C) Ashok Raj <ashok.raj@intel.com> + * Copyright (C) Shaohua Li <shaohua.li@intel.com> + */ + +#ifndef _DMAR_H_ +#define _DMAR_H_ + +#include <xen/list.h> +#include <xen/iommu.h> + +extern u8 dmar_host_address_width; + +/* This one is for interrupt remapping */ +struct acpi_ioapic_unit { + struct list_head list; + int apic_id; + union { + u16 info; + struct { + u16 func: 3, + dev: 5, + bus: 8; + }bdf; + }ioapic; +}; + +struct dmar_scope { + DECLARE_BITMAP(buses, 256); /* buses owned by this unit */ + u16 *devices; /* devices owned by this unit */ + int devices_cnt; +}; + +struct acpi_drhd_unit { + struct dmar_scope scope; /* must be first member of struct */ + struct list_head list; + u64 address; /* register base address of the unit */ + u8 include_all:1; + struct iommu *iommu; + struct list_head ioapic_list; +}; + +struct acpi_rmrr_unit { + struct dmar_scope scope; /* must be first member of struct */ + struct list_head list; + u64 base_address; + u64 end_address; + u8 allow_all:1; +}; + +struct acpi_atsr_unit { + struct dmar_scope scope; /* must be first member of struct */ + struct list_head list; + u8 all_ports:1; +}; + + +#define for_each_drhd_unit(drhd) \ + list_for_each_entry(drhd, &acpi_drhd_units, list) + +#define for_each_rmrr_device(rmrr, bdf, idx) \ + list_for_each_entry(rmrr, &acpi_rmrr_units, list) \ + /* assume there never is a bdf == 0 */ \ + for (idx = 0; (bdf = rmrr->scope.devices[idx]) && \ + idx < rmrr->scope.devices_cnt; idx++) + +struct acpi_drhd_unit * acpi_find_matched_drhd_unit(u8 bus, u8 devfn); +struct acpi_atsr_unit * acpi_find_matched_atsr_unit(u8 bus, u8 devfn); +void dmar_scope_add_buses(struct dmar_scope *scope, u16 sec, u16 sub); +void dmar_scope_remove_buses(struct dmar_scope *scope, u16 sec, u16 sub); + +#define DMAR_TYPE 1 +#define RMRR_TYPE 2 +#define ATSR_TYPE 3 + +#define DMAR_OPERATION_TIMEOUT MILLISECS(1000) + +int vtd_hw_check(void); +void disable_pmr(struct iommu *iommu); +int is_usb_device(u8 bus, u8 devfn); + +#endif /* _DMAR_H_ */ diff --git a/drivers/passthrough/vtd/extern.h b/drivers/passthrough/vtd/extern.h new file mode 100644 index 0000000..6fec3e5 --- /dev/null +++ b/drivers/passthrough/vtd/extern.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + * Copyright (C) Allen Kay <allen.m.kay@intel.com> + * Copyright (C) Weidong Han <weidong.han@intel.com> + */ + +#ifndef _VTD_EXTERN_H_ +#define _VTD_EXTERN_H_ + +#include "dmar.h" + + +void print_iommu_regs(struct acpi_drhd_unit *drhd); +void print_vtd_entries(struct iommu *iommu, int bus, int devfn, u64 gmfn); +void dump_iommu_info(unsigned char key); + +void clear_fault_bits(struct iommu *iommu); + +#endif // _VTD_EXTERN_H_ diff --git a/drivers/passthrough/vtd/iommu.c b/drivers/passthrough/vtd/iommu.c new file mode 100644 index 0000000..988182f --- /dev/null +++ b/drivers/passthrough/vtd/iommu.c @@ -0,0 +1,1884 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) Ashok Raj <ashok.raj@intel.com> + * Copyright (C) Shaohua Li <shaohua.li@intel.com> + * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen + */ + +#include <xen/irq.h> +#include <xen/sched.h> +#include <xen/xmalloc.h> +#include <xen/domain_page.h> +#include <xen/iommu.h> +#include <asm/hvm/iommu.h> +#include <xen/numa.h> +#include <xen/time.h> +#include <xen/pci.h> +#include <xen/pci_regs.h> +#include <xen/keyhandler.h> +#include "iommu.h" +#include "dmar.h" +#include "extern.h" +#include "vtd.h" + +#define domain_iommu_domid(d) ((d)->arch.hvm_domain.hvm_iommu.iommu_domid) + +static spinlock_t domid_bitmap_lock; /* protect domain id bitmap */ +static int domid_bitmap_size; /* domain id bitmap size in bits */ +static unsigned long *domid_bitmap; /* iommu domain id bitmap */ + +static void setup_dom0_devices(struct domain *d); +static void setup_dom0_rmrr(struct domain *d); + +#define DID_FIELD_WIDTH 16 +#define DID_HIGH_OFFSET 8 +static void context_set_domain_id(struct context_entry *context, + struct domain *d) +{ + domid_t iommu_domid = domain_iommu_domid(d); + + if ( iommu_domid == 0 ) + { + spin_lock(&domid_bitmap_lock); + iommu_domid = find_first_zero_bit(domid_bitmap, domid_bitmap_size); + set_bit(iommu_domid, domid_bitmap); + spin_unlock(&domid_bitmap_lock); + d->arch.hvm_domain.hvm_iommu.iommu_domid = iommu_domid; + } + + context->hi &= (1 << DID_HIGH_OFFSET) - 1; + context->hi |= iommu_domid << DID_HIGH_OFFSET; +} + +static void iommu_domid_release(struct domain *d) +{ + domid_t iommu_domid = domain_iommu_domid(d); + + if ( iommu_domid != 0 ) + { + d->arch.hvm_domain.hvm_iommu.iommu_domid = 0; + clear_bit(iommu_domid, domid_bitmap); + } +} + +static struct intel_iommu *alloc_intel_iommu(void) +{ + struct intel_iommu *intel; + + intel = xmalloc(struct intel_iommu); + if ( intel == NULL ) + return NULL; + memset(intel, 0, sizeof(struct intel_iommu)); + + return intel; +} + +static void free_intel_iommu(struct intel_iommu *intel) +{ + xfree(intel); +} + +struct iommu_flush *iommu_get_flush(struct iommu *iommu) +{ + return iommu ? 
&iommu->intel->flush : NULL; +} + +static unsigned int clflush_size; +static int iommus_incoherent; +static void __iommu_flush_cache(void *addr, int size) +{ + int i; + + if ( !iommus_incoherent ) + return; + + for ( i = 0; i < size; i += clflush_size ) + cacheline_flush((char *)addr + i); +} + +void iommu_flush_cache_entry(void *addr) +{ + __iommu_flush_cache(addr, 8); +} + +void iommu_flush_cache_page(void *addr) +{ + __iommu_flush_cache(addr, PAGE_SIZE_4K); +} + +int nr_iommus; +/* context entry handling */ +static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus) +{ + struct root_entry *root, *root_entries; + u64 maddr; + + ASSERT(spin_is_locked(&iommu->lock)); + root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr); + root = &root_entries[bus]; + if ( !root_present(*root) ) + { + maddr = alloc_pgtable_maddr(NULL); + if ( maddr == 0 ) + { + unmap_vtd_domain_page(root_entries); + return 0; + } + set_root_value(*root, maddr); + set_root_present(*root); + iommu_flush_cache_entry(root); + } + maddr = (u64) get_context_addr(*root); + unmap_vtd_domain_page(root_entries); + return maddr; +} + +static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc) +{ + struct hvm_iommu *hd = domain_hvm_iommu(domain); + int addr_width = agaw_to_width(hd->agaw); + struct dma_pte *parent, *pte = NULL; + int level = agaw_to_level(hd->agaw); + int offset; + u64 pte_maddr = 0, maddr; + u64 *vaddr = NULL; + + addr &= (((u64)1) << addr_width) - 1; + ASSERT(spin_is_locked(&hd->mapping_lock)); + if ( hd->pgd_maddr == 0 ) + if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr(domain)) == 0) ) + goto out; + + parent = (struct dma_pte *)map_vtd_domain_page(hd->pgd_maddr); + while ( level > 1 ) + { + offset = address_level_offset(addr, level); + pte = &parent[offset]; + + if ( dma_pte_addr(*pte) == 0 ) + { + if ( !alloc ) + break; + maddr = alloc_pgtable_maddr(domain); + if ( !maddr ) + break; + dma_set_pte_addr(*pte, maddr); + vaddr = map_vtd_domain_page(maddr); + + /* + * high level table always sets r/w, last level + * page table control read/write + */ + dma_set_pte_readable(*pte); + dma_set_pte_writable(*pte); + iommu_flush_cache_entry(pte); + } + else + { + vaddr = map_vtd_domain_page(pte->val); + } + + if ( level == 2 ) + { + pte_maddr = pte->val & PAGE_MASK_4K; + unmap_vtd_domain_page(vaddr); + break; + } + + unmap_vtd_domain_page(parent); + parent = (struct dma_pte *)vaddr; + vaddr = NULL; + level--; + } + + unmap_vtd_domain_page(parent); + out: + return pte_maddr; +} + +static void iommu_flush_write_buffer(struct iommu *iommu) +{ + u32 val; + unsigned long flag; + s_time_t start_time; + + if ( !cap_rwbf(iommu->cap) ) + return; + val = iommu->gcmd | DMA_GCMD_WBF; + + spin_lock_irqsave(&iommu->register_lock, flag); + dmar_writel(iommu->reg, DMAR_GCMD_REG, val); + + /* Make sure hardware complete it */ + start_time = NOW(); + for ( ; ; ) + { + val = dmar_readl(iommu->reg, DMAR_GSTS_REG); + if ( !(val & DMA_GSTS_WBFS) ) + break; + if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT ) + panic("%s: DMAR hardware is malfunctional," + " please disable IOMMU\n", __func__); + cpu_relax(); + } + spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +/* return value determine if we need a write buffer flush */ +static int flush_context_reg( + void *_iommu, + u16 did, u16 source_id, u8 function_mask, u64 type, + int non_present_entry_flush) +{ + struct iommu *iommu = (struct iommu *) _iommu; + u64 val = 0; + unsigned long flag; + s_time_t start_time; + + /* + * In the 
non-present entry flush case, if hardware doesn't cache + * non-present entry we do nothing and if hardware cache non-present + * entry, we flush entries of domain 0 (the domain id is used to cache + * any non-present entries) + */ + if ( non_present_entry_flush ) + { + if ( !cap_caching_mode(iommu->cap) ) + return 1; + else + did = 0; + } + + /* use register invalidation */ + switch ( type ) + { + case DMA_CCMD_GLOBAL_INVL: + val = DMA_CCMD_GLOBAL_INVL; + break; + case DMA_CCMD_DOMAIN_INVL: + val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); + break; + case DMA_CCMD_DEVICE_INVL: + val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) + |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask); + break; + default: + BUG(); + } + val |= DMA_CCMD_ICC; + + spin_lock_irqsave(&iommu->register_lock, flag); + dmar_writeq(iommu->reg, DMAR_CCMD_REG, val); + + /* Make sure hardware complete it */ + start_time = NOW(); + for ( ; ; ) + { + val = dmar_readq(iommu->reg, DMAR_CCMD_REG); + if ( !(val & DMA_CCMD_ICC) ) + break; + if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT ) + panic("%s: DMAR hardware is malfunctional," + " please disable IOMMU\n", __func__); + cpu_relax(); + } + spin_unlock_irqrestore(&iommu->register_lock, flag); + /* flush context entry will implicitly flush write buffer */ + return 0; +} + +static int inline iommu_flush_context_global( + struct iommu *iommu, int non_present_entry_flush) +{ + struct iommu_flush *flush = iommu_get_flush(iommu); + return flush->context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL, + non_present_entry_flush); +} + +static int inline iommu_flush_context_domain( + struct iommu *iommu, u16 did, int non_present_entry_flush) +{ + struct iommu_flush *flush = iommu_get_flush(iommu); + return flush->context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL, + non_present_entry_flush); +} + +static int inline iommu_flush_context_device( + struct iommu *iommu, u16 did, u16 source_id, + u8 function_mask, int non_present_entry_flush) +{ + struct iommu_flush *flush = iommu_get_flush(iommu); + return flush->context(iommu, did, source_id, function_mask, + DMA_CCMD_DEVICE_INVL, + non_present_entry_flush); +} + +/* return value determine if we need a write buffer flush */ +static int flush_iotlb_reg(void *_iommu, u16 did, + u64 addr, unsigned int size_order, u64 type, + int non_present_entry_flush) +{ + struct iommu *iommu = (struct iommu *) _iommu; + int tlb_offset = ecap_iotlb_offset(iommu->ecap); + u64 val = 0, val_iva = 0; + unsigned long flag; + s_time_t start_time; + + /* + * In the non-present entry flush case, if hardware doesn't cache + * non-present entry we do nothing and if hardware cache non-present + * entry, we flush entries of domain 0 (the domain id is used to cache + * any non-present entries) + */ + if ( non_present_entry_flush ) + { + if ( !cap_caching_mode(iommu->cap) ) + return 1; + else + did = 0; + } + + /* use register invalidation */ + switch ( type ) + { + case DMA_TLB_GLOBAL_FLUSH: + /* global flush doesn't need set IVA_REG */ + val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; + break; + case DMA_TLB_DSI_FLUSH: + val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); + break; + case DMA_TLB_PSI_FLUSH: + val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); + /* Note: always flush non-leaf currently */ + val_iva = size_order | addr; + break; + default: + BUG(); + } + /* Note: set drain read/write */ + if ( cap_read_drain(iommu->cap) ) + val |= DMA_TLB_READ_DRAIN; + if ( cap_write_drain(iommu->cap) ) + val |= DMA_TLB_WRITE_DRAIN; + + spin_lock_irqsave(&iommu->register_lock, flag); + /* 
Note: Only uses first TLB reg currently */ + if ( val_iva ) + dmar_writeq(iommu->reg, tlb_offset, val_iva); + dmar_writeq(iommu->reg, tlb_offset + 8, val); + + /* Make sure hardware complete it */ + start_time = NOW(); + for ( ; ; ) + { + val = dmar_readq(iommu->reg, tlb_offset + 8); + if ( !(val & DMA_TLB_IVT) ) + break; + if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT ) + panic("%s: DMAR hardware is malfunctional," + " please disable IOMMU\n", __func__); + cpu_relax(); + } + spin_unlock_irqrestore(&iommu->register_lock, flag); + + /* check IOTLB invalidation granularity */ + if ( DMA_TLB_IAIG(val) == 0 ) + dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n"); + + /* flush iotlb entry will implicitly flush write buffer */ + return 0; +} + +static int inline iommu_flush_iotlb_global(struct iommu *iommu, + int non_present_entry_flush) +{ + struct iommu_flush *flush = iommu_get_flush(iommu); + return flush->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH, + non_present_entry_flush); +} + +static int inline iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did, + int non_present_entry_flush) +{ + struct iommu_flush *flush = iommu_get_flush(iommu); + return flush->iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH, + non_present_entry_flush); +} + +static int inline get_alignment(u64 base, unsigned int size) +{ + int t = 0; + u64 end; + + end = base + size - 1; + while ( base != end ) + { + t++; + base >>= 1; + end >>= 1; + } + return t; +} + +static int inline iommu_flush_iotlb_psi( + struct iommu *iommu, u16 did, + u64 addr, unsigned int pages, int non_present_entry_flush) +{ + unsigned int align; + struct iommu_flush *flush = iommu_get_flush(iommu); + + ASSERT(!(addr & (~PAGE_MASK_4K))); + ASSERT(pages > 0); + + /* Fallback to domain selective flush if no PSI support */ + if ( !cap_pgsel_inv(iommu->cap) ) + return iommu_flush_iotlb_dsi(iommu, did, + non_present_entry_flush); + + /* + * PSI requires page size is 2 ^ x, and the base address is naturally + * aligned to the size + */ + align = get_alignment(addr >> PAGE_SHIFT_4K, pages); + /* Fallback to domain selective flush if size is too big */ + if ( align > cap_max_amask_val(iommu->cap) ) + return iommu_flush_iotlb_dsi(iommu, did, + non_present_entry_flush); + + addr >>= PAGE_SHIFT_4K + align; + addr <<= PAGE_SHIFT_4K + align; + + return flush->iotlb(iommu, did, addr, align, + DMA_TLB_PSI_FLUSH, non_present_entry_flush); +} + +void iommu_flush_all(void) +{ + struct acpi_drhd_unit *drhd; + struct iommu *iommu; + + flush_all_cache(); + for_each_drhd_unit ( drhd ) + { + iommu = drhd->iommu; + iommu_flush_context_global(iommu, 0); + iommu_flush_iotlb_global(iommu, 0); + } +} + +/* clear one page's page table */ +static void dma_pte_clear_one(struct domain *domain, u64 addr) +{ + struct hvm_iommu *hd = domain_hvm_iommu(domain); + struct acpi_drhd_unit *drhd; + struct iommu *iommu; + struct dma_pte *page = NULL, *pte = NULL; + u64 pg_maddr; + + spin_lock(&hd->mapping_lock); + /* get last level pte */ + pg_maddr = addr_to_dma_page_maddr(domain, addr, 0); + if ( pg_maddr == 0 ) + { + spin_unlock(&hd->mapping_lock); + return; + } + + page = (struct dma_pte *)map_vtd_domain_page(pg_maddr); + pte = page + address_level_offset(addr, 1); + + if ( !dma_pte_present(*pte) ) + { + spin_unlock(&hd->mapping_lock); + unmap_vtd_domain_page(page); + return; + } + + dma_clear_pte(*pte); + spin_unlock(&hd->mapping_lock); + iommu_flush_cache_entry(pte); + + /* No need pcidevs_lock here since do that on assign/deassign device*/ + for_each_drhd_unit ( drhd ) + { + iommu = 
drhd->iommu; + if ( test_bit(iommu->index, &hd->iommu_bitmap) ) + if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(domain), + addr, 1, 0)) + iommu_flush_write_buffer(iommu); + } + + unmap_vtd_domain_page(page); +} + +static void iommu_free_pagetable(u64 pt_maddr, int level) +{ + int i; + struct dma_pte *pt_vaddr, *pte; + int next_level = level - 1; + + if ( pt_maddr == 0 ) + return; + + pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr); + + for ( i = 0; i < PTE_NUM; i++ ) + { + pte = &pt_vaddr[i]; + if ( !dma_pte_present(*pte) ) + continue; + + if ( next_level >= 1 ) + iommu_free_pagetable(dma_pte_addr(*pte), next_level); + + dma_clear_pte(*pte); + iommu_flush_cache_entry(pte); + } + + unmap_vtd_domain_page(pt_vaddr); + free_pgtable_maddr(pt_maddr); +} + +static int iommu_set_root_entry(struct iommu *iommu) +{ + u32 cmd, sts; + unsigned long flags; + s_time_t start_time; + + spin_lock(&iommu->lock); + + if ( iommu->root_maddr == 0 ) + iommu->root_maddr = alloc_pgtable_maddr(NULL); + if ( iommu->root_maddr == 0 ) + { + spin_unlock(&iommu->lock); + return -ENOMEM; + } + + spin_unlock(&iommu->lock); + spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr); + cmd = iommu->gcmd | DMA_GCMD_SRTP; + dmar_writel(iommu->reg, DMAR_GCMD_REG, cmd); + + /* Make sure hardware complete it */ + start_time = NOW(); + for ( ; ; ) + { + sts = dmar_readl(iommu->reg, DMAR_GSTS_REG); + if ( sts & DMA_GSTS_RTPS ) + break; + if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT ) + panic("%s: DMAR hardware is malfunctional," + " please disable IOMMU\n", __func__); + cpu_relax(); + } + + spin_unlock_irqrestore(&iommu->register_lock, flags); + + return 0; +} + +static void iommu_enable_translation(struct iommu *iommu) +{ + u32 sts; + unsigned long flags; + s_time_t start_time; + + dprintk(XENLOG_INFO VTDPREFIX, + "iommu_enable_translation: iommu->reg = %p\n", iommu->reg); + spin_lock_irqsave(&iommu->register_lock, flags); + iommu->gcmd |= DMA_GCMD_TE; + dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd); + /* Make sure hardware complete it */ + start_time = NOW(); + for ( ; ; ) + { + sts = dmar_readl(iommu->reg, DMAR_GSTS_REG); + if ( sts & DMA_GSTS_TES ) + break; + if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT ) + panic("%s: DMAR hardware is malfunctional," + " please disable IOMMU\n", __func__); + cpu_relax(); + } + + /* Disable PMRs when VT-d engine takes effect per spec definition */ + disable_pmr(iommu); + spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +int iommu_disable_translation(struct iommu *iommu) +{ + u32 sts; + unsigned long flags; + s_time_t start_time; + + spin_lock_irqsave(&iommu->register_lock, flags); + iommu->gcmd &= ~ DMA_GCMD_TE; + dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd); + + /* Make sure hardware complete it */ + start_time = NOW(); + for ( ; ; ) + { + sts = dmar_readl(iommu->reg, DMAR_GSTS_REG); + if ( !(sts & DMA_GSTS_TES) ) + break; + if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT ) + panic("%s: DMAR hardware is malfunctional," + " please disable IOMMU\n", __func__); + cpu_relax(); + } + spin_unlock_irqrestore(&iommu->register_lock, flags); + return 0; +} + +static struct iommu *vector_to_iommu[NR_VECTORS]; +static int iommu_page_fault_do_one(struct iommu *iommu, int type, + u8 fault_reason, u16 source_id, u64 addr) +{ + dprintk(XENLOG_WARNING VTDPREFIX, + "iommu_fault:%s: %x:%x.%x addr %"PRIx64" REASON %x " + "iommu->reg = %p\n", + (type ? 
"DMA Read" : "DMA Write"), (source_id >> 8), + PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr, + fault_reason, iommu->reg); + +#ifndef __i386__ /* map_domain_page() cannot be used in this context */ + if ( fault_reason < 0x20 ) + print_vtd_entries(iommu, (source_id >> 8), + (source_id & 0xff), (addr >> PAGE_SHIFT)); +#endif + + return 0; +} + +static void iommu_fault_status(u32 fault_status) +{ + if ( fault_status & DMA_FSTS_PFO ) + dprintk(XENLOG_ERR VTDPREFIX, + "iommu_fault_status: Fault Overflow\n"); + if ( fault_status & DMA_FSTS_PPF ) + dprintk(XENLOG_ERR VTDPREFIX, + "iommu_fault_status: Primary Pending Fault\n"); + if ( fault_status & DMA_FSTS_AFO ) + dprintk(XENLOG_ERR VTDPREFIX, + "iommu_fault_status: Advanced Fault Overflow\n"); + if ( fault_status & DMA_FSTS_APF ) + dprintk(XENLOG_ERR VTDPREFIX, + "iommu_fault_status: Advanced Pending Fault\n"); + if ( fault_status & DMA_FSTS_IQE ) + dprintk(XENLOG_ERR VTDPREFIX, + "iommu_fault_status: Invalidation Queue Error\n"); + if ( fault_status & DMA_FSTS_ICE ) + dprintk(XENLOG_ERR VTDPREFIX, + "iommu_fault_status: Invalidation Completion Error\n"); + if ( fault_status & DMA_FSTS_ITE ) + dprintk(XENLOG_ERR VTDPREFIX, + "iommu_fault_status: Invalidation Time-out Error\n"); +} + +#define PRIMARY_FAULT_REG_LEN (16) +static void iommu_page_fault(int vector, void *dev_id, + struct cpu_user_regs *regs) +{ + struct iommu *iommu = dev_id; + int reg, fault_index; + u32 fault_status; + unsigned long flags; + + dprintk(XENLOG_WARNING VTDPREFIX, + "iommu_page_fault: iommu->reg = %p\n", iommu->reg); + + fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG); + + iommu_fault_status(fault_status); + + /* FIXME: ignore advanced fault log */ + if ( !(fault_status & DMA_FSTS_PPF) ) + goto clear_overflow; + + fault_index = dma_fsts_fault_record_index(fault_status); + reg = cap_fault_reg_offset(iommu->cap); + while (1) + { + u8 fault_reason; + u16 source_id; + u32 data; + u64 guest_addr; + int type; + + /* highest 32 bits */ + spin_lock_irqsave(&iommu->register_lock, flags); + data = dmar_readl(iommu->reg, reg + + fault_index * PRIMARY_FAULT_REG_LEN + 12); + if ( !(data & DMA_FRCD_F) ) + { + spin_unlock_irqrestore(&iommu->register_lock, flags); + break; + } + + fault_reason = dma_frcd_fault_reason(data); + type = dma_frcd_type(data); + + data = dmar_readl(iommu->reg, reg + + fault_index * PRIMARY_FAULT_REG_LEN + 8); + source_id = dma_frcd_source_id(data); + + guest_addr = dmar_readq(iommu->reg, reg + + fault_index * PRIMARY_FAULT_REG_LEN); + guest_addr = dma_frcd_page_addr(guest_addr); + /* clear the fault */ + dmar_writel(iommu->reg, reg + + fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F); + spin_unlock_irqrestore(&iommu->register_lock, flags); + + iommu_page_fault_do_one(iommu, type, fault_reason, + source_id, guest_addr); + + fault_index++; + if ( fault_index > cap_num_fault_regs(iommu->cap) ) + fault_index = 0; + } +clear_overflow: + /* clear primary fault overflow */ + fault_status = readl(iommu->reg + DMAR_FSTS_REG); + if ( fault_status & DMA_FSTS_PFO ) + { + spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO); + spin_unlock_irqrestore(&iommu->register_lock, flags); + } +} + +static void dma_msi_unmask(unsigned int vector) +{ + struct iommu *iommu = vector_to_iommu[vector]; + unsigned long flags; + + /* unmask it */ + spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writel(iommu->reg, DMAR_FECTL_REG, 0); + spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +static 
void dma_msi_mask(unsigned int vector) +{ + unsigned long flags; + struct iommu *iommu = vector_to_iommu[vector]; + + /* mask it */ + spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM); + spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +static unsigned int dma_msi_startup(unsigned int vector) +{ + dma_msi_unmask(vector); + return 0; +} + +static void dma_msi_end(unsigned int vector) +{ + dma_msi_unmask(vector); + ack_APIC_irq(); +} + +static void dma_msi_data_init(struct iommu *iommu, int vector) +{ + u32 msi_data = 0; + unsigned long flags; + + /* Fixed, edge, assert mode. Follow MSI setting */ + msi_data |= vector & 0xff; + msi_data |= 1 << 14; + + spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writel(iommu->reg, DMAR_FEDATA_REG, msi_data); + spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +#ifdef SUPPORT_MSI_REMAPPING +static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu) +{ + u64 msi_address; + unsigned long flags; + + /* Physical, dedicated cpu. Follow MSI setting */ + msi_address = (MSI_ADDRESS_HEADER << (MSI_ADDRESS_HEADER_SHIFT + 8)); + msi_address |= MSI_PHYSICAL_MODE << 2; + msi_address |= MSI_REDIRECTION_HINT_MODE << 3; + msi_address |= phy_cpu << MSI_TARGET_CPU_SHIFT; + + spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writel(iommu->reg, DMAR_FEADDR_REG, (u32)msi_address); + dmar_writel(iommu->reg, DMAR_FEUADDR_REG, (u32)(msi_address >> 32)); + spin_unlock_irqrestore(&iommu->register_lock, flags); +} +#else +static void dma_msi_addr_init(struct iommu *iommu, int phy_cpu) +{ + /* ia64: TODO */ +} +#endif + +static void dma_msi_set_affinity(unsigned int vector, cpumask_t dest) +{ + struct iommu *iommu = vector_to_iommu[vector]; + dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(dest))); +} + +static struct hw_interrupt_type dma_msi_type = { + .typename = "DMA_MSI", + .startup = dma_msi_startup, + .shutdown = dma_msi_mask, + .enable = dma_msi_unmask, + .disable = dma_msi_mask, + .ack = dma_msi_mask, + .end = dma_msi_end, + .set_affinity = dma_msi_set_affinity, +}; + +int iommu_set_interrupt(struct iommu *iommu) +{ + int vector, ret; + + vector = assign_irq_vector(AUTO_ASSIGN); + vector_to_iommu[vector] = iommu; + + /* VT-d fault is a MSI, make irq == vector */ + irq_vector[vector] = vector; + vector_irq[vector] = vector; + + if ( !vector ) + { + gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no vectors\n"); + return -EINVAL; + } + + irq_desc[vector].handler = &dma_msi_type; + ret = request_irq(vector, iommu_page_fault, 0, "dmar", iommu); + if ( ret ) + gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n"); + return vector; +} + +static int iommu_alloc(struct acpi_drhd_unit *drhd) +{ + struct iommu *iommu; + unsigned long sagaw; + int agaw; + + if ( nr_iommus > MAX_IOMMUS ) + { + gdprintk(XENLOG_ERR VTDPREFIX, + "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus); + return -ENOMEM; + } + + iommu = xmalloc(struct iommu); + if ( iommu == NULL ) + return -ENOMEM; + memset(iommu, 0, sizeof(struct iommu)); + + iommu->intel = alloc_intel_iommu(); + if ( iommu->intel == NULL ) + { + xfree(iommu); + return -ENOMEM; + } + + iommu->reg = map_to_nocache_virt(nr_iommus, drhd->address); + iommu->index = nr_iommus++; + + iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG); + iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG); + + /* Calculate number of pagetable levels: between 2 and 4. 
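(cap_sagaw() reports the unit's supported adjusted guest address widths as a bitmap; pick the highest one, capped at the 4-level width.)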
*/ + sagaw = cap_sagaw(iommu->cap); + for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- ) + if ( test_bit(agaw, &sagaw) ) + break; + if ( agaw < 0 ) + { + gdprintk(XENLOG_ERR VTDPREFIX, + "IOMMU: unsupported sagaw %lx\n", sagaw); + xfree(iommu); + return -ENODEV; + } + iommu->nr_pt_levels = agaw_to_level(agaw); + + if ( !ecap_coherent(iommu->ecap) ) + iommus_incoherent = 1; + + spin_lock_init(&iommu->lock); + spin_lock_init(&iommu->register_lock); + + drhd->iommu = iommu; + return 0; +} + +static void iommu_free(struct acpi_drhd_unit *drhd) +{ + struct iommu *iommu = drhd->iommu; + + if ( iommu == NULL ) + return; + + if ( iommu->root_maddr != 0 ) + { + free_pgtable_maddr(iommu->root_maddr); + iommu->root_maddr = 0; + } + + if ( iommu->reg ) + iounmap(iommu->reg); + + free_intel_iommu(iommu->intel); + free_irq(iommu->vector); + xfree(iommu); + + drhd->iommu = NULL; +} + +#define guestwidth_to_adjustwidth(gaw) ({ \ + int agaw, r = (gaw - 12) % 9; \ + agaw = (r == 0) ? gaw : (gaw + 9 - r); \ + if ( agaw > 64 ) \ + agaw = 64; \ + agaw; }) + +static int intel_iommu_domain_init(struct domain *d) +{ + struct hvm_iommu *hd = domain_hvm_iommu(d); + u64 i, j, tmp; + struct acpi_drhd_unit *drhd; + + hd->agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH); + + if ( d->domain_id == 0 ) + { + extern int xen_in_range(paddr_t start, paddr_t end); + + /* + * Set up 1:1 page table for dom0 except the critical segments + * like Xen. + */ + for ( i = 0; i < max_page; i++ ) + { + if ( xen_in_range(i << PAGE_SHIFT, (i + 1) << PAGE_SHIFT) ) + continue; + + tmp = 1 << (PAGE_SHIFT - PAGE_SHIFT_4K); + for ( j = 0; j < tmp; j++ ) + iommu_map_page(d, (i*tmp+j), (i*tmp+j)); + } + + setup_dom0_devices(d); + setup_dom0_rmrr(d); + + iommu_flush_all(); + + for_each_drhd_unit ( drhd ) + iommu_enable_translation(drhd->iommu); + } + + return 0; +} + +static int domain_context_mapping_one( + struct domain *domain, + struct iommu *iommu, + u8 bus, u8 devfn) +{ + struct hvm_iommu *hd = domain_hvm_iommu(domain); + struct context_entry *context, *context_entries; + u64 maddr, pgd_maddr; + struct pci_dev *pdev = NULL; + int agaw; + + ASSERT(spin_is_locked(&pcidevs_lock)); + spin_lock(&iommu->lock); + maddr = bus_to_context_maddr(iommu, bus); + context_entries = (struct context_entry *)map_vtd_domain_page(maddr); + context = &context_entries[devfn]; + + if ( context_present(*context) ) + { + int res = 0; + + pdev = pci_get_pdev(bus, devfn); + if (!pdev) + res = -ENODEV; + else if (pdev->domain != domain) + res = -EINVAL; + unmap_vtd_domain_page(context_entries); + spin_unlock(&iommu->lock); + return res; + } + + if ( iommu_passthrough && + ecap_pass_thru(iommu->ecap) && (domain->domain_id == 0) ) + { + context_set_translation_type(*context, CONTEXT_TT_PASS_THRU); + agaw = level_to_agaw(iommu->nr_pt_levels); + } + else + { + spin_lock(&hd->mapping_lock); + + /* Ensure we have pagetables allocated down to leaf PTE. */ + if ( hd->pgd_maddr == 0 ) + { + addr_to_dma_page_maddr(domain, 0, 1); + if ( hd->pgd_maddr == 0 ) + { + nomem: + spin_unlock(&hd->mapping_lock); + spin_unlock(&iommu->lock); + unmap_vtd_domain_page(context_entries); + return -ENOMEM; + } + } + + /* Skip top levels of page tables for 2- and 3-level DRHDs. 
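The domain's own table is built for the default 4-level width, so descend one level per iteration until the depth matches this unit's nr_pt_levels.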
*/ + pgd_maddr = hd->pgd_maddr; + for ( agaw = level_to_agaw(4); + agaw != level_to_agaw(iommu->nr_pt_levels); + agaw-- ) + { + struct dma_pte *p = map_vtd_domain_page(pgd_maddr); + pgd_maddr = dma_pte_addr(*p); + unmap_vtd_domain_page(p); + if ( pgd_maddr == 0 ) + goto nomem; + } + + context_set_address_root(*context, pgd_maddr); + context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL); + spin_unlock(&hd->mapping_lock); + } + + /* + * domain_id 0 is not valid on Intel's IOMMU, force domain_id to + * be 1 based as required by intel's iommu hw. + */ + context_set_domain_id(context, domain); + context_set_address_width(*context, agaw); + context_set_fault_enable(*context); + context_set_present(*context); + iommu_flush_cache_entry(context); + spin_unlock(&iommu->lock); + + /* Context entry was previously non-present (with domid 0). */ + if ( iommu_flush_context_device(iommu, 0, (((u16)bus) << 8) | devfn, + DMA_CCMD_MASK_NOBIT, 1) ) + iommu_flush_write_buffer(iommu); + else + iommu_flush_iotlb_dsi(iommu, 0, 1); + + set_bit(iommu->index, &hd->iommu_bitmap); + + unmap_vtd_domain_page(context_entries); + + return 0; +} + +#define PCI_BASE_CLASS_BRIDGE 0x06 +#define PCI_CLASS_BRIDGE_PCI 0x0604 + +enum { + DEV_TYPE_PCIe_ENDPOINT, + DEV_TYPE_PCIe_BRIDGE, // PCIe root port, switch + DEV_TYPE_PCI_BRIDGE, // PCIe-to-PCI/PCIx bridge, PCI-to-PCI bridge + DEV_TYPE_PCI, +}; + +int pdev_type(u8 bus, u8 devfn) +{ + u16 class_device; + u16 status, creg; + int pos; + u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn); + + class_device = pci_conf_read16(bus, d, f, PCI_CLASS_DEVICE); + if ( class_device == PCI_CLASS_BRIDGE_PCI ) + { + pos = pci_find_next_cap(bus, devfn, + PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP); + if ( !pos ) + return DEV_TYPE_PCI_BRIDGE; + creg = pci_conf_read16(bus, d, f, pos + PCI_EXP_FLAGS); + return ((creg & PCI_EXP_FLAGS_TYPE) >> 4) == PCI_EXP_TYPE_PCI_BRIDGE ? 
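+        /*
+         * PCI_EXP_TYPE_PCI_BRIDGE in the PCI Express capability flags
+         * identifies a PCI Express to PCI/PCI-X bridge, i.e. conventional
+         * PCI behind this device; any other bridge that carries a PCIe
+         * capability (root port, switch) is treated as a PCIe bridge.
+         */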
+ DEV_TYPE_PCI_BRIDGE : DEV_TYPE_PCIe_BRIDGE; + } + + status = pci_conf_read16(bus, d, f, PCI_STATUS); + if ( !(status & PCI_STATUS_CAP_LIST) ) + return DEV_TYPE_PCI; + + if ( pci_find_next_cap(bus, devfn, PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP) ) + return DEV_TYPE_PCIe_ENDPOINT; + + return DEV_TYPE_PCI; +} + +#define MAX_BUSES 256 +static DEFINE_SPINLOCK(bus2bridge_lock); +static struct { u8 map, bus, devfn; } bus2bridge[MAX_BUSES]; + +static int _find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus) +{ + int cnt = 0; + *secbus = *bus; + + ASSERT(spin_is_locked(&bus2bridge_lock)); + if ( !bus2bridge[*bus].map ) + return 0; + + while ( bus2bridge[*bus].map ) + { + *secbus = *bus; + *devfn = bus2bridge[*bus].devfn; + *bus = bus2bridge[*bus].bus; + if ( cnt++ >= MAX_BUSES ) + return 0; + } + + return 1; +} + +static int find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus) +{ + int ret = 0; + + if ( *bus == 0 ) + /* assume integrated PCI devices in RC have valid requester-id */ + return 1; + + spin_lock(&bus2bridge_lock); + ret = _find_pcie_endpoint(bus, devfn, secbus); + spin_unlock(&bus2bridge_lock); + + return ret; +} + +static int domain_context_mapping(struct domain *domain, u8 bus, u8 devfn) +{ + struct acpi_drhd_unit *drhd; + int ret = 0; + u16 sec_bus, sub_bus; + u32 type; + u8 secbus; + + drhd = acpi_find_matched_drhd_unit(bus, devfn); + if ( !drhd ) + return -ENODEV; + + ASSERT(spin_is_locked(&pcidevs_lock)); + + type = pdev_type(bus, devfn); + switch ( type ) + { + case DEV_TYPE_PCIe_BRIDGE: + break; + + case DEV_TYPE_PCI_BRIDGE: + sec_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn), + PCI_SECONDARY_BUS); + sub_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn), + PCI_SUBORDINATE_BUS); + + spin_lock(&bus2bridge_lock); + for ( sub_bus &= 0xff; sec_bus <= sub_bus; sec_bus++ ) + { + bus2bridge[sec_bus].map = 1; + bus2bridge[sec_bus].bus = bus; + bus2bridge[sec_bus].devfn = devfn; + } + spin_unlock(&bus2bridge_lock); + break; + + case DEV_TYPE_PCIe_ENDPOINT: + gdprintk(XENLOG_INFO VTDPREFIX, + "domain_context_mapping:PCIe: bdf = %x:%x.%x\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn); + break; + + case DEV_TYPE_PCI: + gdprintk(XENLOG_INFO VTDPREFIX, + "domain_context_mapping:PCI: bdf = %x:%x.%x\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + + ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn); + if ( ret ) + break; + + secbus = bus; + /* dependent devices mapping */ + while ( bus2bridge[bus].map ) + { + secbus = bus; + devfn = bus2bridge[bus].devfn; + bus = bus2bridge[bus].bus; + ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn); + if ( ret ) + return ret; + } + + if ( secbus != bus ) + /* + * The source-id for transactions on non-PCIe buses seem + * to originate from devfn=0 on the secondary bus behind + * the bridge. Map that id as well. The id to use in + * these scanarios is not particularly well documented + * anywhere. 
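+             * For example, a device at 07:02.1 on a conventional bus
+             * behind a PCIe-to-PCI bridge whose secondary bus is 7 may
+             * tag its transactions with source-id 07:00.0, so that
+             * context entry is programmed as well.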
+ */ + ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0); + break; + + default: + gdprintk(XENLOG_ERR VTDPREFIX, + "domain_context_mapping:unknown type : bdf = %x:%x.%x\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + ret = -EINVAL; + break; + } + + return ret; +} + +static int domain_context_unmap_one( + struct domain *domain, + struct iommu *iommu, + u8 bus, u8 devfn) +{ + struct context_entry *context, *context_entries; + u64 maddr; + + ASSERT(spin_is_locked(&pcidevs_lock)); + spin_lock(&iommu->lock); + + maddr = bus_to_context_maddr(iommu, bus); + context_entries = (struct context_entry *)map_vtd_domain_page(maddr); + context = &context_entries[devfn]; + + if ( !context_present(*context) ) + { + spin_unlock(&iommu->lock); + unmap_vtd_domain_page(context_entries); + return 0; + } + + context_clear_present(*context); + context_clear_entry(*context); + iommu_flush_cache_entry(context); + + if ( iommu_flush_context_device(iommu, domain_iommu_domid(domain), + (((u16)bus) << 8) | devfn, + DMA_CCMD_MASK_NOBIT, 0) ) + iommu_flush_write_buffer(iommu); + else + iommu_flush_iotlb_dsi(iommu, domain_iommu_domid(domain), 0); + + spin_unlock(&iommu->lock); + unmap_vtd_domain_page(context_entries); + + return 0; +} + +static int domain_context_unmap(struct domain *domain, u8 bus, u8 devfn) +{ + struct acpi_drhd_unit *drhd; + int ret = 0; + u32 type; + u8 secbus; + + drhd = acpi_find_matched_drhd_unit(bus, devfn); + if ( !drhd ) + return -ENODEV; + + type = pdev_type(bus, devfn); + switch ( type ) + { + case DEV_TYPE_PCIe_BRIDGE: + case DEV_TYPE_PCI_BRIDGE: + break; + + case DEV_TYPE_PCIe_ENDPOINT: + gdprintk(XENLOG_INFO VTDPREFIX, + "domain_context_unmap:PCIe: bdf = %x:%x.%x\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn); + break; + + case DEV_TYPE_PCI: + gdprintk(XENLOG_INFO VTDPREFIX, + "domain_context_unmap:PCI: bdf = %x:%x.%x\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn); + if ( ret ) + break; + + secbus = bus; + /* dependent devices unmapping */ + while ( bus2bridge[bus].map ) + { + secbus = bus; + devfn = bus2bridge[bus].devfn; + bus = bus2bridge[bus].bus; + ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn); + if ( ret ) + return ret; + } + + if ( bus != secbus ) + ret = domain_context_unmap_one(domain, drhd->iommu, secbus, 0); + break; + + default: + gdprintk(XENLOG_ERR VTDPREFIX, + "domain_context_unmap:unknown type: bdf = %x:%x:%x\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + ret = -EINVAL; + break; + } + + return ret; +} + +static int reassign_device_ownership( + struct domain *source, + struct domain *target, + u8 bus, u8 devfn) +{ + struct hvm_iommu *source_hd = domain_hvm_iommu(source); + struct pci_dev *pdev; + struct acpi_drhd_unit *drhd; + struct iommu *pdev_iommu; + int ret, found = 0; + + ASSERT(spin_is_locked(&pcidevs_lock)); + pdev = pci_get_pdev_by_domain(source, bus, devfn); + + if (!pdev) + return -ENODEV; + + drhd = acpi_find_matched_drhd_unit(bus, devfn); + pdev_iommu = drhd->iommu; + domain_context_unmap(source, bus, devfn); + + ret = domain_context_mapping(target, bus, devfn); + if ( ret ) + return ret; + + list_move(&pdev->domain_list, &target->arch.pdev_list); + pdev->domain = target; + + for_each_pdev ( source, pdev ) + { + drhd = acpi_find_matched_drhd_unit(pdev->bus, pdev->devfn); + if ( drhd->iommu == pdev_iommu ) + { + found = 1; + break; + } + } + + if ( !found ) + clear_bit(pdev_iommu->index, 
&source_hd->iommu_bitmap); + + return ret; +} + +void iommu_domain_teardown(struct domain *d) +{ + struct hvm_iommu *hd = domain_hvm_iommu(d); + + if ( list_empty(&acpi_drhd_units) ) + return; + + spin_lock(&hd->mapping_lock); + iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw)); + hd->pgd_maddr = 0; + spin_unlock(&hd->mapping_lock); + + iommu_domid_release(d); +} + +int intel_iommu_map_page( + struct domain *d, unsigned long gfn, unsigned long mfn) +{ + struct hvm_iommu *hd = domain_hvm_iommu(d); + struct acpi_drhd_unit *drhd; + struct iommu *iommu; + struct dma_pte *page = NULL, *pte = NULL; + u64 pg_maddr; + int pte_present; + + drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list); + iommu = drhd->iommu; + + /* do nothing if dom0 and iommu supports pass thru */ + if ( iommu_passthrough && + ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) ) + return 0; + + spin_lock(&hd->mapping_lock); + + pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1); + if ( pg_maddr == 0 ) + { + spin_unlock(&hd->mapping_lock); + return -ENOMEM; + } + page = (struct dma_pte *)map_vtd_domain_page(pg_maddr); + pte = page + (gfn & LEVEL_MASK); + pte_present = dma_pte_present(*pte); + dma_set_pte_addr(*pte, (paddr_t)mfn << PAGE_SHIFT_4K); + dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE); + iommu_flush_cache_entry(pte); + spin_unlock(&hd->mapping_lock); + unmap_vtd_domain_page(page); + + /* + * No need pcideves_lock here because we have flush + * when assign/deassign device + */ + for_each_drhd_unit ( drhd ) + { + iommu = drhd->iommu; + + if ( !test_bit(iommu->index, &hd->iommu_bitmap) ) + continue; + + if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(d), + (paddr_t)gfn << PAGE_SHIFT_4K, 1, + !pte_present) ) + iommu_flush_write_buffer(iommu); + } + + return 0; +} + +int intel_iommu_unmap_page(struct domain *d, unsigned long gfn) +{ + struct acpi_drhd_unit *drhd; + struct iommu *iommu; + + drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list); + iommu = drhd->iommu; + + /* do nothing if dom0 and iommu supports pass thru */ + if ( iommu_passthrough && + ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) ) + return 0; + + dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K); + + return 0; +} + +static int iommu_prepare_rmrr_dev(struct domain *d, + struct acpi_rmrr_unit *rmrr, + u8 bus, u8 devfn) +{ + int ret = 0; + u64 base, end; + unsigned long base_pfn, end_pfn; + + ASSERT(spin_is_locked(&pcidevs_lock)); + ASSERT(rmrr->base_address < rmrr->end_address); + + base = rmrr->base_address & PAGE_MASK_4K; + base_pfn = base >> PAGE_SHIFT_4K; + end = PAGE_ALIGN_4K(rmrr->end_address); + end_pfn = end >> PAGE_SHIFT_4K; + + while ( base_pfn < end_pfn ) + { + intel_iommu_map_page(d, base_pfn, base_pfn); + base_pfn++; + } + + ret = domain_context_mapping(d, bus, devfn); + + return ret; +} + +static int intel_iommu_add_device(struct pci_dev *pdev) +{ + struct acpi_rmrr_unit *rmrr; + u16 bdf; + int ret, i; + + ASSERT(spin_is_locked(&pcidevs_lock)); + + if ( !pdev->domain ) + return -EINVAL; + + ret = domain_context_mapping(pdev->domain, pdev->bus, pdev->devfn); + if ( ret ) + { + gdprintk(XENLOG_ERR VTDPREFIX, + "intel_iommu_add_device: context mapping failed\n"); + return ret; + } + + for_each_rmrr_device ( rmrr, bdf, i ) + { + if ( PCI_BUS(bdf) == pdev->bus && PCI_DEVFN2(bdf) == pdev->devfn ) + { + ret = iommu_prepare_rmrr_dev(pdev->domain, rmrr, + pdev->bus, pdev->devfn); + if ( ret ) + gdprintk(XENLOG_ERR VTDPREFIX, + "intel_iommu_add_device: RMRR mapping failed\n"); + break; 
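+            /*
+             * RMRRs are BIOS-reported regions that a device may DMA into
+             * at any time (USB legacy keyboard emulation, for instance),
+             * so they are identity-mapped before use; only the first
+             * matching entry is applied, hence the break above.
+             */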
+ } + } + + return ret; +} + +static int intel_iommu_remove_device(struct pci_dev *pdev) +{ + struct acpi_rmrr_unit *rmrr; + u16 bdf; + int i; + + if ( !pdev->domain ) + return -EINVAL; + + /* If the device belongs to dom0, and it has RMRR, don't remove it + * from dom0, because BIOS may use RMRR at booting time. + */ + if ( pdev->domain->domain_id == 0 ) + { + for_each_rmrr_device ( rmrr, bdf, i ) + { + if ( PCI_BUS(bdf) == pdev->bus && + PCI_DEVFN2(bdf) == pdev->devfn ) + return 0; + } + } + + return domain_context_unmap(pdev->domain, pdev->bus, pdev->devfn); +} + +static void setup_dom0_devices(struct domain *d) +{ + struct hvm_iommu *hd; + struct pci_dev *pdev; + int bus, dev, func; + u32 l; + + hd = domain_hvm_iommu(d); + + spin_lock(&pcidevs_lock); + for ( bus = 0; bus < 256; bus++ ) + { + for ( dev = 0; dev < 32; dev++ ) + { + for ( func = 0; func < 8; func++ ) + { + l = pci_conf_read32(bus, dev, func, PCI_VENDOR_ID); + /* some broken boards return 0 or ~0 if a slot is empty: */ + if ( (l == 0xffffffff) || (l == 0x00000000) || + (l == 0x0000ffff) || (l == 0xffff0000) ) + continue; + + pdev = alloc_pdev(bus, PCI_DEVFN(dev, func)); + pdev->domain = d; + list_add(&pdev->domain_list, &d->arch.pdev_list); + domain_context_mapping(d, pdev->bus, pdev->devfn); + } + } + } + spin_unlock(&pcidevs_lock); +} + +void clear_fault_bits(struct iommu *iommu) +{ + u64 val; + + val = dmar_readq( + iommu->reg, + cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+0x8); + dmar_writeq( + iommu->reg, + cap_fault_reg_offset(dmar_readq(iommu->reg,DMAR_CAP_REG))+8, + val); + dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS); +} + +static int init_vtd_hw(void) +{ + struct acpi_drhd_unit *drhd; + struct iommu *iommu; + struct iommu_flush *flush = NULL; + int vector; + int ret; + + for_each_drhd_unit ( drhd ) + { + iommu = drhd->iommu; + ret = iommu_set_root_entry(iommu); + if ( ret ) + { + gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n"); + return -EIO; + } + + vector = iommu_set_interrupt(iommu); + dma_msi_data_init(iommu, vector); + dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map))); + iommu->vector = vector; + clear_fault_bits(iommu); + dmar_writel(iommu->reg, DMAR_FECTL_REG, 0); + + /* initialize flush functions */ + flush = iommu_get_flush(iommu); + flush->context = flush_context_reg; + flush->iotlb = flush_iotlb_reg; + } + + return 0; +} + +static void setup_dom0_rmrr(struct domain *d) +{ + struct acpi_rmrr_unit *rmrr; + u16 bdf; + int ret, i; + + spin_lock(&pcidevs_lock); + for_each_rmrr_device ( rmrr, bdf, i ) + { + ret = iommu_prepare_rmrr_dev(d, rmrr, PCI_BUS(bdf), PCI_DEVFN2(bdf)); + if ( ret ) + gdprintk(XENLOG_ERR VTDPREFIX, + "IOMMU: mapping reserved region failed\n"); + } + spin_unlock(&pcidevs_lock); +} + +int intel_vtd_setup(void) +{ + struct acpi_drhd_unit *drhd; + struct iommu *iommu; + + if ( !vtd_enabled ) + return -ENODEV; + + spin_lock_init(&domid_bitmap_lock); + clflush_size = get_cache_line_size(); + + for_each_drhd_unit ( drhd ) + if ( iommu_alloc(drhd) != 0 ) + goto error; + + /* Allocate IO page directory page for the domain. 
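+     * (done lazily via addr_to_dma_page_maddr() on first mapping); what
+     * this path does is pick the first DRHD unit so its capability
+     * register can size the global domain-id bitmap: cap_ndoms() decodes
+     * 2^(4 + 2n) supported domain-ids from the ND field, e.g. ND = 2
+     * allows 256 domains. Bit 0 is reserved because domain-id 0 is not
+     * valid on VT-d hardware.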
*/ + drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list); + iommu = drhd->iommu; + + /* Allocate domain id bitmap, and set bit 0 as reserved */ + domid_bitmap_size = cap_ndoms(iommu->cap); + domid_bitmap = xmalloc_array(unsigned long, + BITS_TO_LONGS(domid_bitmap_size)); + if ( domid_bitmap == NULL ) + goto error; + memset(domid_bitmap, 0, domid_bitmap_size / 8); + set_bit(0, domid_bitmap); + + if ( init_vtd_hw() ) + goto error; + + return 0; + + error: + for_each_drhd_unit ( drhd ) + iommu_free(drhd); + vtd_enabled = 0; + return -ENOMEM; +} + +/* + * If the device isn't owned by dom0, it means it already + * has been assigned to other domain, or it's not exist. + */ +int device_assigned(u8 bus, u8 devfn) +{ + struct pci_dev *pdev; + + spin_lock(&pcidevs_lock); + pdev = pci_get_pdev_by_domain(dom0, bus, devfn); + if (!pdev) + { + spin_unlock(&pcidevs_lock); + return -1; + } + + spin_unlock(&pcidevs_lock); + return 0; +} + +int intel_iommu_assign_device(struct domain *d, u8 bus, u8 devfn) +{ + struct acpi_rmrr_unit *rmrr; + int ret = 0, i; + struct pci_dev *pdev; + u16 bdf; + + if ( list_empty(&acpi_drhd_units) ) + return -ENODEV; + + ASSERT(spin_is_locked(&pcidevs_lock)); + pdev = pci_get_pdev(bus, devfn); + if (!pdev) + return -ENODEV; + + if (pdev->domain != dom0) + { + gdprintk(XENLOG_ERR VTDPREFIX, + "IOMMU: assign a assigned device\n"); + return -EBUSY; + } + + ret = reassign_device_ownership(dom0, d, bus, devfn); + if ( ret ) + goto done; + + /* Setup rmrr identity mapping */ + for_each_rmrr_device( rmrr, bdf, i ) + { + if ( PCI_BUS(bdf) == bus && PCI_DEVFN2(bdf) == devfn ) + { + /* FIXME: Because USB RMRR conflicts with guest bios region, + * ignore USB RMRR temporarily. + */ + if ( is_usb_device(bus, devfn) ) + { + ret = 0; + goto done; + } + + ret = iommu_prepare_rmrr_dev(d, rmrr, bus, devfn); + if ( ret ) + gdprintk(XENLOG_ERR VTDPREFIX, + "IOMMU: mapping reserved region failed\n"); + goto done; + } + } + +done: + return ret; +} + +static int intel_iommu_group_id(u8 bus, u8 devfn) +{ + u8 secbus; + if ( !bus2bridge[bus].map || find_pcie_endpoint(&bus, &devfn, &secbus) ) + return PCI_BDF2(bus, devfn); + else + return -1; +} + +static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS]; +void iommu_suspend(void) +{ + struct acpi_drhd_unit *drhd; + struct iommu *iommu; + u32 i; + + if ( !vtd_enabled ) + return; + + iommu_flush_all(); + + for_each_drhd_unit ( drhd ) + { + iommu = drhd->iommu; + i = iommu->index; + + iommu_state[i][DMAR_FECTL_REG] = + (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG); + iommu_state[i][DMAR_FEDATA_REG] = + (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG); + iommu_state[i][DMAR_FEADDR_REG] = + (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG); + iommu_state[i][DMAR_FEUADDR_REG] = + (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG); + } +} + +void iommu_resume(void) +{ + struct acpi_drhd_unit *drhd; + struct iommu *iommu; + u32 i; + + if ( !vtd_enabled ) + return; + + iommu_flush_all(); + + if ( init_vtd_hw() != 0 && force_iommu ) + panic("IOMMU setup failed, crash Xen for security purpose!\n"); + + for_each_drhd_unit ( drhd ) + { + iommu = drhd->iommu; + i = iommu->index; + + dmar_writel(iommu->reg, DMAR_FECTL_REG, + (u32) iommu_state[i][DMAR_FECTL_REG]); + dmar_writel(iommu->reg, DMAR_FEDATA_REG, + (u32) iommu_state[i][DMAR_FEDATA_REG]); + dmar_writel(iommu->reg, DMAR_FEADDR_REG, + (u32) iommu_state[i][DMAR_FEADDR_REG]); + dmar_writel(iommu->reg, DMAR_FEUADDR_REG, + (u32) iommu_state[i][DMAR_FEUADDR_REG]); + iommu_enable_translation(iommu); + } +} + +struct 
iommu_ops intel_iommu_ops = { + .init = intel_iommu_domain_init, + .add_device = intel_iommu_add_device, + .remove_device = intel_iommu_remove_device, + .assign_device = intel_iommu_assign_device, + .teardown = iommu_domain_teardown, + .map_page = intel_iommu_map_page, + .unmap_page = intel_iommu_unmap_page, + .reassign_device = reassign_device_ownership, + .get_device_group_id = intel_iommu_group_id, +}; + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/drivers/passthrough/vtd/iommu.h b/drivers/passthrough/vtd/iommu.h new file mode 100644 index 0000000..73c1657 --- /dev/null +++ b/drivers/passthrough/vtd/iommu.h @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) Ashok Raj <ashok.raj@intel.com> + */ + +#ifndef _INTEL_IOMMU_H_ +#define _INTEL_IOMMU_H_ + +#include <xen/types.h> + +/* + * Intel IOMMU register specification per version 1.0 public spec. + */ + +#define DMAR_VER_REG 0x0 /* Arch version supported by this IOMMU */ +#define DMAR_CAP_REG 0x8 /* Hardware supported capabilities */ +#define DMAR_ECAP_REG 0x10 /* Extended capabilities supported */ +#define DMAR_GCMD_REG 0x18 /* Global command register */ +#define DMAR_GSTS_REG 0x1c /* Global status register */ +#define DMAR_RTADDR_REG 0x20 /* Root entry table */ +#define DMAR_CCMD_REG 0x28 /* Context command reg */ +#define DMAR_FSTS_REG 0x34 /* Fault Status register */ +#define DMAR_FECTL_REG 0x38 /* Fault control register */ +#define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data register */ +#define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr register */ +#define DMAR_FEUADDR_REG 0x44 /* Upper address register */ +#define DMAR_AFLOG_REG 0x58 /* Advanced Fault control */ +#define DMAR_PMEN_REG 0x64 /* Enable Protected Memory Region */ +#define DMAR_PLMBASE_REG 0x68 /* PMRR Low addr */ +#define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */ +#define DMAR_PHMBASE_REG 0x70 /* pmrr high base addr */ +#define DMAR_PHMLIMIT_REG 0x78 /* pmrr high limit */ +#define DMAR_IQH_REG 0x80 /* invalidation queue head */ +#define DMAR_IQT_REG 0x88 /* invalidation queue tail */ +#define DMAR_IQA_REG 0x90 /* invalidation queue addr */ +#define DMAR_IRTA_REG 0xB8 /* intr remap */ + +#define OFFSET_STRIDE (9) +#define dmar_readl(dmar, reg) readl(dmar + reg) +#define dmar_writel(dmar, reg, val) writel(val, dmar + reg) +#define dmar_readq(dmar, reg) ({ \ + u32 lo, hi; \ + lo = dmar_readl(dmar, reg); \ + hi = dmar_readl(dmar, reg + 4); \ + (((u64) hi) << 32) + lo; }) +#define dmar_writeq(dmar, reg, val) do {\ + dmar_writel(dmar, reg, (u32)val); \ + dmar_writel(dmar, reg + 4, (u32)((u64) val >> 32)); \ + } while (0) + +#define VER_MAJOR(v) (((v) & 0xf0) >> 4) +#define VER_MINOR(v) ((v) & 0x0f) + +/* + * Decoding Capability Register + */ +#define 
cap_read_drain(c) (((c) >> 55) & 1) +#define cap_write_drain(c) (((c) >> 54) & 1) +#define cap_max_amask_val(c) (((c) >> 48) & 0x3f) +#define cap_num_fault_regs(c) ((((c) >> 40) & 0xff) + 1) +#define cap_pgsel_inv(c) (((c) >> 39) & 1) + +#define cap_super_page_val(c) (((c) >> 34) & 0xf) +#define cap_super_offset(c) (((find_first_bit(&cap_super_page_val(c), 4)) \ + * OFFSET_STRIDE) + 21) + +#define cap_fault_reg_offset(c) ((((c) >> 24) & 0x3ff) * 16) + +#define cap_isoch(c) (((c) >> 23) & 1) +#define cap_qos(c) (((c) >> 22) & 1) +#define cap_mgaw(c) ((((c) >> 16) & 0x3f) + 1) +#define cap_sagaw(c) (((c) >> 8) & 0x1f) +#define cap_caching_mode(c) (((c) >> 7) & 1) +#define cap_phmr(c) (((c) >> 6) & 1) +#define cap_plmr(c) (((c) >> 5) & 1) +#define cap_rwbf(c) (((c) >> 4) & 1) +#define cap_afl(c) (((c) >> 3) & 1) +#define cap_ndoms(c) (1 << (4 + 2 * ((c) & 0x7))) + +/* + * Extended Capability Register + */ + +#define ecap_niotlb_iunits(e) ((((e) >> 24) & 0xff) + 1) +#define ecap_iotlb_offset(e) ((((e) >> 8) & 0x3ff) * 16) +#define ecap_coherent(e) ((e >> 0) & 0x1) +#define ecap_queued_inval(e) ((e >> 1) & 0x1) +#define ecap_dev_iotlb(e) ((e >> 2) & 0x1) +#define ecap_intr_remap(e) ((e >> 3) & 0x1) +#define ecap_ext_intr(e) ((e >> 4) & 0x1) +#define ecap_cache_hints(e) ((e >> 5) & 0x1) +#define ecap_pass_thru(e) ((e >> 6) & 0x1) + +/* IOTLB_REG */ +#define DMA_TLB_FLUSH_GRANU_OFFSET 60 +#define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60) +#define DMA_TLB_DSI_FLUSH (((u64)2) << 60) +#define DMA_TLB_PSI_FLUSH (((u64)3) << 60) +#define DMA_TLB_IIRG(x) (((x) >> 60) & 7) +#define DMA_TLB_IAIG(val) (((val) >> 57) & 7) +#define DMA_TLB_DID(x) (((u64)(x & 0xffff)) << 32) + +#define DMA_TLB_READ_DRAIN (((u64)1) << 49) +#define DMA_TLB_WRITE_DRAIN (((u64)1) << 48) +#define DMA_TLB_IVT (((u64)1) << 63) + +#define DMA_TLB_IVA_ADDR(x) ((((u64)x) >> 12) << 12) +#define DMA_TLB_IVA_HINT(x) ((((u64)x) & 1) << 6) + +/* GCMD_REG */ +#define DMA_GCMD_TE (((u64)1) << 31) +#define DMA_GCMD_SRTP (((u64)1) << 30) +#define DMA_GCMD_SFL (((u64)1) << 29) +#define DMA_GCMD_EAFL (((u64)1) << 28) +#define DMA_GCMD_WBF (((u64)1) << 27) +#define DMA_GCMD_QIE (((u64)1) << 26) +#define DMA_GCMD_IRE (((u64)1) << 25) +#define DMA_GCMD_SIRTP (((u64)1) << 24) +#define DMA_GCMD_CFI (((u64)1) << 23) + +/* GSTS_REG */ +#define DMA_GSTS_TES (((u64)1) << 31) +#define DMA_GSTS_RTPS (((u64)1) << 30) +#define DMA_GSTS_FLS (((u64)1) << 29) +#define DMA_GSTS_AFLS (((u64)1) << 28) +#define DMA_GSTS_WBFS (((u64)1) << 27) +#define DMA_GSTS_QIES (((u64)1) <<26) +#define DMA_GSTS_IRES (((u64)1) <<25) +#define DMA_GSTS_SIRTPS (((u64)1) << 24) +#define DMA_GSTS_CFIS (((u64)1) <<23) + +/* PMEN_REG */ +#define DMA_PMEN_EPM (((u32)1) << 31) +#define DMA_PMEN_PRS (((u32)1) << 0) + +/* CCMD_REG */ +#define DMA_CCMD_INVL_GRANU_OFFSET 61 +#define DMA_CCMD_ICC (((u64)1) << 63) +#define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61) +#define DMA_CCMD_DOMAIN_INVL (((u64)2) << 61) +#define DMA_CCMD_DEVICE_INVL (((u64)3) << 61) +#define DMA_CCMD_FM(m) (((u64)((m) & 0x3)) << 32) +#define DMA_CCMD_CIRG(x) ((((u64)3) << 61) & x) +#define DMA_CCMD_MASK_NOBIT 0 +#define DMA_CCMD_MASK_1BIT 1 +#define DMA_CCMD_MASK_2BIT 2 +#define DMA_CCMD_MASK_3BIT 3 +#define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16) +#define DMA_CCMD_DID(d) ((u64)((d) & 0xffff)) + +#define DMA_CCMD_CAIG_MASK(x) (((u64)x) & ((u64) 0x3 << 59)) + +/* FECTL_REG */ +#define DMA_FECTL_IM (((u64)1) << 31) + +/* FSTS_REG */ +#define DMA_FSTS_PFO ((u64)1 << 0) +#define DMA_FSTS_PPF ((u64)1 << 1) +#define DMA_FSTS_AFO 
((u64)1 << 2) +#define DMA_FSTS_APF ((u64)1 << 3) +#define DMA_FSTS_IQE ((u64)1 << 4) +#define DMA_FSTS_ICE ((u64)1 << 5) +#define DMA_FSTS_ITE ((u64)1 << 6) +#define DMA_FSTS_FAULTS DMA_FSTS_PFO | DMA_FSTS_PPF | DMA_FSTS_AFO | DMA_FSTS_APF | DMA_FSTS_IQE | DMA_FSTS_ICE | DMA_FSTS_ITE +#define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff) + +/* FRCD_REG, 32 bits access */ +#define DMA_FRCD_F (((u64)1) << 31) +#define dma_frcd_type(d) ((d >> 30) & 1) +#define dma_frcd_fault_reason(c) (c & 0xff) +#define dma_frcd_source_id(c) (c & 0xffff) +#define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bit */ + +/* + * 0: Present + * 1-11: Reserved + * 12-63: Context Ptr (12 - (haw-1)) + * 64-127: Reserved + */ +struct root_entry { + u64 val; + u64 rsvd1; +}; +#define root_present(root) ((root).val & 1) +#define set_root_present(root) do {(root).val |= 1;} while(0) +#define get_context_addr(root) ((root).val & PAGE_MASK_4K) +#define set_root_value(root, value) \ + do {(root).val |= ((value) & PAGE_MASK_4K);} while(0) + +struct context_entry { + u64 lo; + u64 hi; +}; +#define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry)) +#define context_present(c) ((c).lo & 1) +#define context_fault_disable(c) (((c).lo >> 1) & 1) +#define context_translation_type(c) (((c).lo >> 2) & 3) +#define context_address_root(c) ((c).lo & PAGE_MASK_4K) +#define context_address_width(c) ((c).hi & 7) +#define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1)) + +#define context_set_present(c) do {(c).lo |= 1;} while(0) +#define context_clear_present(c) do {(c).lo &= ~1;} while(0) +#define context_set_fault_enable(c) \ + do {(c).lo &= (((u64)-1) << 2) | 1;} while(0) + +#define context_set_translation_type(c, val) do { \ + (c).lo &= (((u64)-1) << 4) | 3; \ + (c).lo |= (val & 3) << 2; \ + } while(0) +#define CONTEXT_TT_MULTI_LEVEL 0 +#define CONTEXT_TT_DEV_IOTLB 1 +#define CONTEXT_TT_PASS_THRU 2 + +#define context_set_address_root(c, val) \ + do {(c).lo &= 0xfff; (c).lo |= (val) & PAGE_MASK_4K ;} while(0) +#define context_set_address_width(c, val) \ + do {(c).hi &= 0xfffffff8; (c).hi |= (val) & 7;} while(0) +#define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while(0) + +/* page table handling */ +#define LEVEL_STRIDE (9) +#define LEVEL_MASK ((1 << LEVEL_STRIDE) - 1) +#define PTE_NUM (1 << LEVEL_STRIDE) +#define level_to_agaw(val) ((val) - 2) +#define agaw_to_level(val) ((val) + 2) +#define agaw_to_width(val) (30 + val * LEVEL_STRIDE) +#define width_to_agaw(w) ((w - 30)/LEVEL_STRIDE) +#define level_to_offset_bits(l) (12 + (l - 1) * LEVEL_STRIDE) +#define address_level_offset(addr, level) \ + ((addr >> level_to_offset_bits(level)) & LEVEL_MASK) +#define level_mask(l) (((u64)(-1)) << level_to_offset_bits(l)) +#define level_size(l) (1 << level_to_offset_bits(l)) +#define align_to_level(addr, l) ((addr + level_size(l) - 1) & level_mask(l)) + +/* + * 0: readable + * 1: writable + * 2-6: reserved + * 7: super page + * 8-11: available + * 12-63: Host physcial address + */ +struct dma_pte { + u64 val; +}; +#define DMA_PTE_READ (1) +#define DMA_PTE_WRITE (2) +#define dma_clear_pte(p) do {(p).val = 0;} while(0) +#define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while(0) +#define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while(0) +#define dma_set_pte_superpage(p) do {(p).val |= (1 << 7);} while(0) +#define dma_set_pte_prot(p, prot) \ + do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0) +#define dma_pte_addr(p) ((p).val & PAGE_MASK_4K) +#define dma_set_pte_addr(p, addr) do {\ + 
(p).val |= ((addr) & PAGE_MASK_4K); } while (0) +#define dma_pte_present(p) (((p).val & 3) != 0) + +struct poll_info { + u64 saddr; + u32 udata; +}; + +#define VTD_PAGE_TABLE_LEVEL_3 3 +#define VTD_PAGE_TABLE_LEVEL_4 4 + +#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48 +#define MAX_IOMMU_REGS 0xc0 + +extern struct list_head acpi_drhd_units; +extern struct list_head acpi_rmrr_units; +extern struct list_head acpi_ioapic_units; + +struct iommu_flush { + int (*context)(void *iommu, u16 did, u16 source_id, + u8 function_mask, u64 type, int non_present_entry_flush); + int (*iotlb)(void *iommu, u16 did, u64 addr, unsigned int size_order, + u64 type, int non_present_entry_flush); +}; + +struct intel_iommu { + struct iommu_flush flush; +}; + +#endif diff --git a/drivers/passthrough/vtd/utils.c b/drivers/passthrough/vtd/utils.c new file mode 100644 index 0000000..bf9fb6b --- /dev/null +++ b/drivers/passthrough/vtd/utils.c @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) Allen Kay <allen.m.kay@intel.com> + */ + +#include <xen/sched.h> +#include <xen/delay.h> +#include <xen/iommu.h> +#include <xen/time.h> +#include <xen/pci.h> +#include <xen/pci_regs.h> +#include "iommu.h" +#include "dmar.h" +#include "vtd.h" +#include "extern.h" + +int is_usb_device(u8 bus, u8 devfn) +{ + u16 class = pci_conf_read16(bus, PCI_SLOT(devfn), PCI_FUNC(devfn), + PCI_CLASS_DEVICE); + return (class == 0xc03); +} + +/* Disable vt-d protected memory registers. 
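+ * BIOS can mark low and high memory ranges (DMAR_PLMBASE/PLMLIMIT and
+ * DMAR_PHMBASE/PHMLIMIT) as protected from DMA, and those protections
+ * must be lifted before Xen's own translations take effect. The routine
+ * below clears DMA_PMEN_EPM, then polls DMA_PMEN_PRS until the hardware
+ * acknowledges, panicking after DMAR_OPERATION_TIMEOUT.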
*/ +void disable_pmr(struct iommu *iommu) +{ + s_time_t start_time; + unsigned int val; + + val = dmar_readl(iommu->reg, DMAR_PMEN_REG); + if ( !(val & DMA_PMEN_PRS) ) + return; + + dmar_writel(iommu->reg, DMAR_PMEN_REG, val & ~DMA_PMEN_EPM); + start_time = NOW(); + + for ( ; ; ) + { + val = dmar_readl(iommu->reg, DMAR_PMEN_REG); + if ( (val & DMA_PMEN_PRS) == 0 ) + break; + + if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT ) + panic("Disable PMRs timeout\n"); + + cpu_relax(); + } + + dprintk(XENLOG_INFO VTDPREFIX, + "Disabled protected memory registers\n"); +} + +void print_iommu_regs(struct acpi_drhd_unit *drhd) +{ + struct iommu *iommu = drhd->iommu; + + printk("---- print_iommu_regs ----\n"); + printk("print_iommu_regs: drhd->address = %"PRIx64"\n", drhd->address); + printk("print_iommu_regs: DMAR_VER_REG = %x\n", + dmar_readl(iommu->reg,DMAR_VER_REG)); + printk("print_iommu_regs: DMAR_CAP_REG = %"PRIx64"\n", + dmar_readq(iommu->reg,DMAR_CAP_REG)); + printk("print_iommu_regs: n_fault_reg = %"PRIx64"\n", + cap_num_fault_regs(dmar_readq(iommu->reg, DMAR_CAP_REG))); + printk("print_iommu_regs: fault_recording_offset_l = %"PRIx64"\n", + cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG))); + printk("print_iommu_regs: fault_recording_offset_h = %"PRIx64"\n", + cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)) + 8); + printk("print_iommu_regs: fault_recording_reg_l = %"PRIx64"\n", + dmar_readq(iommu->reg, + cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)))); + printk("print_iommu_regs: fault_recording_reg_h = %"PRIx64"\n", + dmar_readq(iommu->reg, + cap_fault_reg_offset(dmar_readq(iommu->reg, DMAR_CAP_REG)) + 8)); + printk("print_iommu_regs: DMAR_ECAP_REG = %"PRIx64"\n", + dmar_readq(iommu->reg,DMAR_ECAP_REG)); + printk("print_iommu_regs: DMAR_GCMD_REG = %x\n", + dmar_readl(iommu->reg,DMAR_GCMD_REG)); + printk("print_iommu_regs: DMAR_GSTS_REG = %x\n", + dmar_readl(iommu->reg,DMAR_GSTS_REG)); + printk("print_iommu_regs: DMAR_RTADDR_REG = %"PRIx64"\n", + dmar_readq(iommu->reg,DMAR_RTADDR_REG)); + printk("print_iommu_regs: DMAR_CCMD_REG = %"PRIx64"\n", + dmar_readq(iommu->reg,DMAR_CCMD_REG)); + printk("print_iommu_regs: DMAR_FSTS_REG = %x\n", + dmar_readl(iommu->reg,DMAR_FSTS_REG)); + printk("print_iommu_regs: DMAR_FECTL_REG = %x\n", + dmar_readl(iommu->reg,DMAR_FECTL_REG)); + printk("print_iommu_regs: DMAR_FEDATA_REG = %x\n", + dmar_readl(iommu->reg,DMAR_FEDATA_REG)); + printk("print_iommu_regs: DMAR_FEADDR_REG = %x\n", + dmar_readl(iommu->reg,DMAR_FEADDR_REG)); + printk("print_iommu_regs: DMAR_FEUADDR_REG = %x\n", + dmar_readl(iommu->reg,DMAR_FEUADDR_REG)); +} + +u32 get_level_index(unsigned long gmfn, int level) +{ + while ( --level ) + gmfn = gmfn >> LEVEL_STRIDE; + + return gmfn & LEVEL_MASK; +} + +void print_vtd_entries(struct iommu *iommu, int bus, int devfn, u64 gmfn) +{ + struct context_entry *ctxt_entry; + struct root_entry *root_entry; + struct dma_pte pte; + u64 *l; + u32 l_index, level; + + printk("print_vtd_entries: iommu = %p bdf = %x:%x:%x gmfn = %"PRIx64"\n", + iommu, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), gmfn); + + if ( iommu->root_maddr == 0 ) + { + printk(" iommu->root_maddr = 0\n"); + return; + } + + root_entry = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr); + + printk(" root_entry = %p\n", root_entry); + printk(" root_entry[%x] = %"PRIx64"\n", bus, root_entry[bus].val); + if ( !root_present(root_entry[bus]) ) + { + unmap_vtd_domain_page(root_entry); + printk(" root_entry[%x] not present\n", bus); + return; + } + + ctxt_entry = + 
(struct context_entry *)map_vtd_domain_page(root_entry[bus].val); + if ( ctxt_entry == NULL ) + { + unmap_vtd_domain_page(root_entry); + printk(" ctxt_entry == NULL\n"); + return; + } + + printk(" context = %p\n", ctxt_entry); + printk(" context[%x] = %"PRIx64"_%"PRIx64"\n", + devfn, ctxt_entry[devfn].hi, ctxt_entry[devfn].lo); + if ( !context_present(ctxt_entry[devfn]) ) + { + unmap_vtd_domain_page(ctxt_entry); + unmap_vtd_domain_page(root_entry); + printk(" ctxt_entry[%x] not present\n", devfn); + return; + } + + level = agaw_to_level(context_address_width(ctxt_entry[devfn])); + if ( level != VTD_PAGE_TABLE_LEVEL_3 && + level != VTD_PAGE_TABLE_LEVEL_4) + { + unmap_vtd_domain_page(ctxt_entry); + unmap_vtd_domain_page(root_entry); + printk("Unsupported VTD page table level (%d)!\n", level); + } + + l = maddr_to_virt(ctxt_entry[devfn].lo); + do + { + l = (u64*)(((unsigned long)l >> PAGE_SHIFT_4K) << PAGE_SHIFT_4K); + printk(" l%d = %p\n", level, l); + if ( l == NULL ) + { + unmap_vtd_domain_page(ctxt_entry); + unmap_vtd_domain_page(root_entry); + printk(" l%d == NULL\n", level); + break; + } + l_index = get_level_index(gmfn, level); + printk(" l%d_index = %x\n", level, l_index); + printk(" l%d[%x] = %"PRIx64"\n", level, l_index, l[l_index]); + + pte.val = l[l_index]; + if ( !dma_pte_present(pte) ) + { + unmap_vtd_domain_page(ctxt_entry); + unmap_vtd_domain_page(root_entry); + printk(" l%d[%x] not present\n", level, l_index); + break; + } + + l = maddr_to_virt(l[l_index]); + } while ( --level ); +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/drivers/passthrough/vtd/vtd.h b/drivers/passthrough/vtd/vtd.h new file mode 100644 index 0000000..ec02d12 --- /dev/null +++ b/drivers/passthrough/vtd/vtd.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) Allen Kay <allen.m.kay@intel.com> + * Copyright (C) Weidong Han <weidong.han@intel.com> + */ + +#ifndef _VTD_H_ +#define _VTD_H_ + +#include <xen/iommu.h> + +/* Accomodate both IOAPIC and IOSAPIC. 
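+ * The layout matches the x86 IOAPIC redirection entry; under __ia64__ an
+ * extra union member widens the destination field to the IOSAPIC's 16-bit
+ * dest_id. dest_mode selects physical vs. logical addressing.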
*/ +struct IO_xAPIC_route_entry { + __u32 vector : 8, + delivery_mode : 3, /* 000: FIXED + * 001: lowest prio + * 111: ExtINT + */ + dest_mode : 1, /* 0: physical, 1: logical */ + delivery_status : 1, + polarity : 1, + irr : 1, + trigger : 1, /* 0: edge, 1: level */ + mask : 1, /* 0: enabled, 1: disabled */ + __reserved_2 : 15; + + union { + struct { __u32 + __reserved_1 : 24, + physical_dest : 4, + __reserved_2 : 4; + } physical; + + struct { __u32 + __reserved_1 : 24, + logical_dest : 8; + } logical; + +#ifdef __ia64__ + struct { __u32 + __reserved_1 : 16, + dest_id : 16; + }; +#endif + } dest; + +} __attribute__ ((packed)); + +struct IO_APIC_route_remap_entry { + union { + u64 val; + struct { + u64 vector:8, + delivery_mode:3, + index_15:1, + delivery_status:1, + polarity:1, + irr:1, + trigger:1, + mask:1, + reserved:31, + format:1, + index_0_14:15; + }; + }; +}; + +struct msi_msg_remap_entry { + union { + u32 val; + struct { + u32 dontcare:2, + index_15:1, + SHV:1, + format:1, + index_0_14:15, + addr_id_val:12; /* Interrupt address identifier value, + must be 0FEEh */ + }; + } address_lo; /* low 32 bits of msi message address */ + + u32 address_hi; /* high 32 bits of msi message address */ + u32 data; /* msi message data */ +}; + +unsigned int get_cache_line_size(void); +void cacheline_flush(char *); +void flush_all_cache(void); +void *map_to_nocache_virt(int nr_iommus, u64 maddr); +u64 alloc_pgtable_maddr(struct domain *d); +void free_pgtable_maddr(u64 maddr); +void *map_vtd_domain_page(u64 maddr); +void unmap_vtd_domain_page(void *va); + +void iommu_flush_cache_entry(void *addr); +void iommu_flush_cache_page(void *addr); + +#endif // _VTD_H_ diff --git a/drivers/passthrough/vtd/x86/Makefile b/drivers/passthrough/vtd/x86/Makefile new file mode 100644 index 0000000..85243e3 --- /dev/null +++ b/drivers/passthrough/vtd/x86/Makefile @@ -0,0 +1 @@ +obj-y += vtd.o diff --git a/drivers/passthrough/vtd/x86/vtd.c b/drivers/passthrough/vtd/x86/vtd.c new file mode 100644 index 0000000..352684a --- /dev/null +++ b/drivers/passthrough/vtd/x86/vtd.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2008, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + * Copyright (C) Allen Kay <allen.m.kay@intel.com> + * Copyright (C) Weidong Han <weidong.han@intel.com> + */ + +#include <xen/sched.h> +#include <xen/domain_page.h> +#include <asm/paging.h> +#include <xen/iommu.h> +#include <xen/numa.h> +#include "../iommu.h" +#include "../dmar.h" +#include "../vtd.h" + +void *map_vtd_domain_page(u64 maddr) +{ + return map_domain_page(maddr >> PAGE_SHIFT_4K); +} + +void unmap_vtd_domain_page(void *va) +{ + unmap_domain_page(va); +} + +/* Allocate page table, return its machine address */ +u64 alloc_pgtable_maddr(struct domain *d) +{ + struct page_info *pg; + u64 *vaddr; + unsigned long mfn; + + pg = alloc_domheap_page(NULL); + if ( !pg ) + return 0; + mfn = page_to_mfn(pg); + vaddr = map_domain_page(mfn); + memset(vaddr, 0, PAGE_SIZE); + + iommu_flush_cache_page(vaddr); + unmap_domain_page(vaddr); + + return (u64)mfn << PAGE_SHIFT_4K; +} + +void free_pgtable_maddr(u64 maddr) +{ + if ( maddr != 0 ) + free_domheap_page(maddr_to_page(maddr)); +} + +unsigned int get_cache_line_size(void) +{ + return ((cpuid_ebx(1) >> 8) & 0xff) * 8; +} + +void cacheline_flush(char * addr) +{ + clflush(addr); +} + +void flush_all_cache() +{ + wbinvd(); +} + +void *map_to_nocache_virt(int nr_iommus, u64 maddr) +{ + set_fixmap_nocache(FIX_IOMMU_REGS_BASE_0 + nr_iommus, maddr); + return (void *)fix_to_virt(FIX_IOMMU_REGS_BASE_0 + nr_iommus); +} + +struct hvm_irq_dpci *domain_get_irq_dpci(struct domain *domain) +{ + if ( !domain ) + return NULL; + + return domain->arch.hvm_domain.irq.dpci; +} + +int domain_set_irq_dpci(struct domain *domain, struct hvm_irq_dpci *dpci) +{ + if ( !domain || !dpci ) + return 0; + + domain->arch.hvm_domain.irq.dpci = dpci; + return 1; +} + +void hvm_dpci_isairq_eoi(struct domain *d, unsigned int isairq) +{ + struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; + struct hvm_irq_dpci *dpci = NULL; + struct dev_intx_gsi_link *digl, *tmp; + int i; + + ASSERT(isairq < NR_ISAIRQS); + if ( !vtd_enabled) + return; + + spin_lock(&d->evtchn_lock); + + dpci = domain_get_irq_dpci(d); + + if ( !dpci || !test_bit(isairq, dpci->isairq_map) ) + { + spin_unlock(&d->evtchn_lock); + return; + } + /* Multiple mirq may be mapped to one isa irq */ + for ( i = find_first_bit(dpci->mapping, NR_IRQS); + i < NR_IRQS; + i = find_next_bit(dpci->mapping, NR_IRQS, i + 1) ) + { + list_for_each_entry_safe ( digl, tmp, + &dpci->mirq[i].digl_list, list ) + { + if ( hvm_irq->pci_link.route[digl->link] == isairq ) + { + hvm_pci_intx_deassert(d, digl->device, digl->intx); + if ( --dpci->mirq[i].pending == 0 ) + { + stop_timer(&dpci->hvm_timer[irq_to_vector(i)]); + pirq_guest_eoi(d, i); + } + } + } + } + spin_unlock(&d->evtchn_lock); +} diff --git a/include/asm-x86/acpi.h b/include/asm-x86/acpi.h index 227f763..8488da0 100644 --- a/include/asm-x86/acpi.h +++ b/include/asm-x86/acpi.h @@ -178,4 +178,6 @@ extern void acpi_reserve_bootmem(void); extern u8 x86_acpiid_to_apicid[]; #define MAX_LOCAL_APIC 256 +extern int acpi_dmar_init(void); + #endif /*_ASM_ACPI_H*/ diff --git a/include/asm-x86/domain.h b/include/asm-x86/domain.h index fd28d4a..7a34df4 100644 --- a/include/asm-x86/domain.h +++ b/include/asm-x86/domain.h @@ -222,6 +222,9 @@ struct arch_domain struct hvm_domain hvm_domain; + /* pass-throughed device list */ + struct list_head pdev_list; + struct paging_domain paging; struct p2m_domain p2m ; diff --git a/include/asm-x86/hvm/domain.h b/include/asm-x86/hvm/domain.h index 50802b3..88cc1cd 100644 --- a/include/asm-x86/hvm/domain.h +++ b/include/asm-x86/hvm/domain.h @@ -27,6 +27,7 @@ 
#include <asm/hvm/io.h> #include <public/hvm/params.h> #include <public/hvm/save.h> +#include <xen/hvm/iommu.h> struct hvm_ioreq_page { spinlock_t lock; @@ -62,6 +63,9 @@ struct hvm_domain { uint64_t params[HVM_NR_PARAMS]; + /* Pass-through */ + struct hvm_iommu hvm_iommu; + #if CONFIG_PAGING_LEVELS == 3 bool_t amd_npt_4gb_warning; #endif diff --git a/include/asm-x86/hvm/irq.h b/include/asm-x86/hvm/irq.h index 7ab4446..b6e78ba 100644 --- a/include/asm-x86/hvm/irq.h +++ b/include/asm-x86/hvm/irq.h @@ -28,6 +28,7 @@ #include <asm/hvm/vpic.h> #include <asm/hvm/vioapic.h> #include <public/hvm/save.h> +#include <xen/hvm/irq.h> struct hvm_irq { @@ -90,6 +91,8 @@ struct hvm_irq { /* Last VCPU that was delivered a LowestPrio interrupt. */ u8 round_robin_prev_vcpu; + + struct hvm_irq_dpci *dpci; }; #define hvm_pci_intx_gsi(dev, intx) \ diff --git a/include/xen/hvm/irq.h b/include/xen/hvm/irq.h new file mode 100644 index 0000000..c0ec1a0 --- /dev/null +++ b/include/xen/hvm/irq.h @@ -0,0 +1,99 @@ +/****************************************************************************** + * irq.h + * + * Interrupt distribution and delivery logic. + * + * Copyright (c) 2006, K A Fraser, XenSource Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + */ + +#ifndef __XEN_HVM_IRQ_H__ +#define __XEN_HVM_IRQ_H__ + +#include <xen/types.h> +#include <xen/spinlock.h> +#include <asm/irq.h> +#include <public/hvm/save.h> + +struct dev_intx_gsi_link { + struct list_head list; + uint8_t device; + uint8_t intx; + uint8_t gsi; + uint8_t link; +}; + +#define _HVM_IRQ_DPCI_MSI 0x1 + +struct hvm_gmsi_info { + uint32_t gvec; + uint32_t gflags; +}; + +struct hvm_mirq_dpci_mapping { + uint32_t flags; + int pending; + struct list_head digl_list; + struct domain *dom; + struct hvm_gmsi_info gmsi; +}; + +struct hvm_girq_dpci_mapping { + uint8_t valid; + uint8_t device; + uint8_t intx; + uint8_t machine_gsi; +}; + +#define NR_ISAIRQS 16 +#define NR_LINK 4 + +/* Protected by domain's evtchn_lock */ +struct hvm_irq_dpci { + /* Machine IRQ to guest device/intx mapping. */ + DECLARE_BITMAP(mapping, NR_IRQS); + struct hvm_mirq_dpci_mapping mirq[NR_IRQS]; + /* Guest IRQ to guest device/intx mapping. */ + struct hvm_girq_dpci_mapping girq[NR_IRQS]; + uint8_t msi_gvec_pirq[NR_VECTORS]; + DECLARE_BITMAP(dirq_mask, NR_IRQS); + /* Record of mapped ISA IRQs */ + DECLARE_BITMAP(isairq_map, NR_ISAIRQS); + /* Record of mapped Links */ + uint8_t link_cnt[NR_LINK]; + struct timer hvm_timer[NR_IRQS]; +}; + +/* Modify state of a PCI INTx wire. */ +void hvm_pci_intx_assert( + struct domain *d, unsigned int device, unsigned int intx); +void hvm_pci_intx_deassert( + struct domain *d, unsigned int device, unsigned int intx); + +/* Modify state of an ISA device's IRQ wire. 
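+ * These mirror the PCI INTx helpers above for the NR_ISAIRQS (16) legacy
+ * ISA IRQ lines; e.g. hvm_isa_irq_assert(d, 1) raises the guest's
+ * keyboard IRQ wire and hvm_isa_irq_deassert(d, 1) lowers it.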
*/ +void hvm_isa_irq_assert( + struct domain *d, unsigned int isa_irq); +void hvm_isa_irq_deassert( + struct domain *d, unsigned int isa_irq); + +void hvm_set_pci_link_route(struct domain *d, u8 link, u8 isa_irq); + +void hvm_maybe_deassert_evtchn_irq(void); +void hvm_assert_evtchn_irq(struct vcpu *v); +void hvm_set_callback_via(struct domain *d, uint64_t via); + +void hvm_dirq_assist(struct vcpu *v); + +#endif /* __XEN_HVM_IRQ_H__ */ diff --git a/include/xen/irq.h b/include/xen/irq.h index f1c1ef4..c145ad9 100644 --- a/include/xen/irq.h +++ b/include/xen/irq.h @@ -63,6 +63,10 @@ extern irq_desc_t irq_desc[NR_IRQS]; extern int setup_irq(unsigned int, struct irqaction *); extern void free_irq(unsigned int); +extern int request_irq(unsigned int irq, + void (*handler)(int, void *, struct cpu_user_regs *), + unsigned long irqflags, const char * devname, + void *dev_id); extern hw_irq_controller no_irq_type; extern void no_action(int cpl, void *dev_id, struct cpu_user_regs *regs); diff --git a/include/xen/pci.h b/include/xen/pci.h index fa01bd2..618c6bf 100644 --- a/include/xen/pci.h +++ b/include/xen/pci.h @@ -30,6 +30,7 @@ #define PCI_BDF2(b,df) ((((b) & 0xff) << 8) | ((df) & 0xff)) struct pci_dev { + struct list_head alldevs_list; struct list_head domain_list; struct domain *domain; const u8 bus; @@ -39,6 +40,25 @@ struct pci_dev { #define for_each_pdev(domain, pdev) \ list_for_each_entry(pdev, &(domain->arch.pdev_list), domain_list) +/* + * The pcidevs_lock protects alldevs_list and the assignment of devices to + * domains; it also synchronizes access to the MSI capability state that is + * not interrupt-handling related (the mask bit register). + */ + +extern spinlock_t pcidevs_lock; + +struct pci_dev *alloc_pdev(u8 bus, u8 devfn); +void free_pdev(struct pci_dev *pdev); +struct pci_dev *pci_lock_pdev(int bus, int devfn); +struct pci_dev *pci_lock_domain_pdev(struct domain *d, int bus, int devfn); + +void pci_release_devices(struct domain *d); +int pci_add_device(u8 bus, u8 devfn); +int pci_remove_device(u8 bus, u8 devfn); +struct pci_dev *pci_get_pdev(int bus, int devfn); +struct pci_dev *pci_get_pdev_by_domain(struct domain *d, int bus, int devfn); + uint8_t pci_conf_read8( unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg); uint16_t pci_conf_read16(