From: Don Dutile <ddutile@redhat.com> Date: Tue, 1 Dec 2009 23:42:46 -0500 Subject: [pci] intel-iommu: add 2.6.32-rc4 sw and hw pass-through Message-id: <4B15A9F6.8090403@redhat.com> Patchwork-id: 21630 O-Subject: [RHEL5.5 PATCH V2] 2/9: INTEL_IOMMU: Backport 2.6.32-rc4 sw and hw pass-through Bugzilla: 516811 518103 RH-Acked-by: Chris Wright <chrisw@redhat.com> RH-Acked-by: Prarit Bhargava <prarit@redhat.com> V2 cleaned up space/tabs/indents and removed duplicate pci_is_root_device that a post rhel5-172 patch added (pci-aer patch?). otherwise, the same. BZ 516727. 0/9 states build & tests. This was a set of over 25 commits, so it seemed ridiculous to list them all. It's primarily this commit with numerous bug fixes & patches: commit 19943b0e30b05d42e494ae6fef78156ebc8c637e Author: David Woodhouse <David.Woodhouse@intel.com> Date: Tue Aug 4 16:19:20 2009 +0100 intel-iommu: Unify hardware and software passthrough support This makes the hardware passthrough mode work a lot more like the software version, so that the behaviour of a kernel with 'iommu=pt' is the same whether the hardware supports passthrough or not. In particular: - We use a single si_domain for the pass-through devices. - 32-bit devices can be taken out of the pass-through domain so that they don't have to use swiotlb. - Devices will work again after being removed from a KVM guest. - A potential oops on OOM (in init_context_pass_through()) is fixed. Signed-off-by: David Woodhouse <David.Woodhouse@intel.com> Please review & ACK. - Don >From f56dfb24677d857c5e60f692bed824b8e6311129 Mon Sep 17 00:00:00 2001 From: Donald Dutile <ddutile@redhat.com> Date: Mon, 16 Nov 2009 17:21:32 -0500 Subject: [PATCH 2/9] INTEL_IOMMU: Backport 2.6.32-rc4 sw and hw pass-through diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index 96d150e..50b27c3 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -40,6 +40,15 @@ int force_iommu __read_mostly= 0; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; +/* + * This variable becomes 1 if iommu=pt is passed on the kernel command line. + * If this variable is 1, IOMMU implementations do no DMA ranslation for + * devices and allow every device to access to whole physical memory. This is + * useful if a user want to use an IOMMU only for KVM device assignment to + * guests and not for driver dma translation. + */ +int iommu_pass_through __read_mostly; + /* Dummy device used for NULL arguments (normally ISA). Better would be probably a smaller DMA mask, but this is bug-to-bug compatible to i386. */ @@ -160,8 +169,6 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, } EXPORT_SYMBOL(dma_alloc_coherent); -extern int iommu_pass_through; - /* * Unmap coherent memory. * The caller must ensure that the device has finished accessing the mapping. @@ -273,10 +280,8 @@ __init int iommu_setup(char *p) #ifdef CONFIG_SWIOTLB if (!strncmp(p, "soft",4)) swiotlb = 1; - if (!strncmp(p, "pt", 2)) { + if (!strncmp(p, "pt", 2)) iommu_pass_through = 1; - return 1; - } #endif #ifdef CONFIG_IOMMU diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c index 92fdbf0..4e6167a 100644 --- a/arch/x86_64/kernel/pci-swiotlb.c +++ b/arch/x86_64/kernel/pci-swiotlb.c @@ -32,8 +32,7 @@ struct dma_mapping_ops swiotlb_dma_ops = { void pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ - if ((!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN) || - iommu_pass_through) + if ((!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN)) swiotlb = 1; if (swiotlb_force) swiotlb = 1; diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index 0b2ce5b..65513e9 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -560,9 +560,6 @@ int __init dmar_table_init(void) printk(KERN_INFO PREFIX "No ATSR found\n"); #endif -#ifdef CONFIG_INTR_REMAP - parse_ioapics_under_ir(); -#endif return 0; } @@ -621,20 +618,32 @@ int alloc_iommu(struct dmar_drhd_unit *drhd) iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG); iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG); + if (iommu->cap == (uint64_t)-1 && iommu->ecap == (uint64_t)-1) { + /* Promote an attitude of violence to a BIOS engineer today */ + printk(KERN_WARNING PREFIX + "Your BIOS is broken; DMAR reported at address %llx returns all ones!\n" + "BIOS vendor: %s; Ver: %s; Product Version: %s\n", + drhd->reg_base_addr, + dmi_get_system_info(DMI_BIOS_VENDOR), + dmi_get_system_info(DMI_BIOS_VERSION), + dmi_get_system_info(DMI_PRODUCT_VERSION)); + goto err_unmap; + } + #ifdef CONFIG_DMAR agaw = iommu_calculate_agaw(iommu); if (agaw < 0) { printk(KERN_ERR "Cannot get a valid agaw for iommu (seq_id = %d)\n", iommu->seq_id); - goto error; + goto err_unmap; } msagaw = iommu_calculate_max_sagaw(iommu); if (msagaw < 0) { printk(KERN_ERR "Cannot get a valid max agaw for iommu (seq_id = %d)\n", iommu->seq_id); - goto error; + goto err_unmap; } #endif iommu->agaw = agaw; @@ -654,7 +663,7 @@ int alloc_iommu(struct dmar_drhd_unit *drhd) } ver = readl(iommu->reg + DMAR_VER_REG); - pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n", + pr_info("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n", (unsigned long long)drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver), (unsigned long long)iommu->cap, @@ -664,7 +673,10 @@ int alloc_iommu(struct dmar_drhd_unit *drhd) drhd->iommu = iommu; return 0; -error: + + err_unmap: + iounmap(iommu->reg); + error: kfree(iommu); return -1; } @@ -826,6 +838,7 @@ restart: cpu_relax(); spin_lock(&qi->q_lock); } + qi->desc_status[index] = QI_DONE; reclaim_free_desc(qi); @@ -1086,8 +1099,8 @@ void dmar_msi_unmask(unsigned int irq) void dmar_msi_mask(unsigned int irq) { - struct intel_iommu *iommu = get_irq_data(irq); unsigned long flag; + struct intel_iommu *iommu = get_irq_data(irq); /* mask it */ spin_lock_irqsave(&iommu->register_lock, flag); @@ -1176,7 +1189,7 @@ irqreturn_t dmar_fault(int irq, void *dev_id, struct pt_regs *regs) source_id, guest_addr); fault_index++; - if (fault_index > cap_num_fault_regs(iommu->cap)) + if (fault_index >= cap_num_fault_regs(iommu->cap)) fault_index = 0; spin_lock_irqsave(&iommu->register_lock, flag); } diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 88eb575..7898263 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -37,6 +37,7 @@ #include <linux/iommu.h> #include <linux/intel-iommu.h> #include <linux/sysdev.h> +#include <linux/dmi.h> #include <asm/cacheflush.h> #include <asm/proto.h> #include "pci.h" @@ -55,11 +56,39 @@ #define MAX_AGAW_WIDTH 64 -#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1) +#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1) +#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1) + +/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR + to match. That way, we can use 'unsigned long' for PFNs with impunity. */ +#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ + __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) +#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) -#define DMA_32BIT_PFN IOVA_PFN(DMA_32BIT_MASK) -#define DMA_64BIT_PFN IOVA_PFN(DMA_64BIT_MASK) +#define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32)) +#define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64)) + + +/* VT-d pages must always be _smaller_ than MM pages. Otherwise things + are never going to work. */ +static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn) +{ + return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT); +} + +static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn) +{ + return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); +} +static inline unsigned long page_to_dma_pfn(struct page *pg) +{ + return mm_to_dma_pfn(page_to_pfn(pg)); +} +static inline unsigned long virt_to_dma_pfn(void *p) +{ + return page_to_dma_pfn(virt_to_page(p)); +} /* global iommu list, set NULL for ignored DMAR units */ static struct intel_iommu **g_iommus; @@ -203,9 +232,9 @@ static inline u64 dma_pte_addr(struct dma_pte *pte) return (pte->val & VTD_PAGE_MASK); } -static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr) +static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn) { - pte->val |= (addr & VTD_PAGE_MASK); + pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT; } static inline bool dma_pte_present(struct dma_pte *pte) @@ -213,6 +242,20 @@ static inline bool dma_pte_present(struct dma_pte *pte) return (pte->val & 3) != 0; } +static inline int first_pte_in_page(struct dma_pte *pte) +{ + return !((unsigned long)pte & ~VTD_PAGE_MASK); +} + +/* + * This domain is a statically identity mapping domain. + * 1. This domain creats a static 1:1 mapping to all usable memory. + * 2. It maps to each iommu if successful. + * 3. Each iommu mapps to this domain if successful. + */ +static struct dmar_domain *si_domain; +static int hw_pass_through = 1; + /* devices under the same p2p bridge are owned in one domain */ #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0) @@ -221,6 +264,9 @@ static inline bool dma_pte_present(struct dma_pte *pte) */ #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1) +/* si_domain contains mulitple devices */ +#define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2) + struct dmar_domain { int id; /* domain id */ unsigned long iommu_bmp; /* bitmap of iommus this domain uses*/ @@ -229,7 +275,6 @@ struct dmar_domain { struct iova_domain iovad; /* iova's that belong to this domain */ struct dma_pte *pgd; /* virtual address */ - spinlock_t mapping_lock; /* page table lock */ int gaw; /* max guest address width */ /* adjusted guest address width, 0 is level 2 30-bit */ @@ -289,7 +334,6 @@ int dmar_disabled = 1; static int __initdata dmar_map_gfx = 1; static int dmar_forcedac; static int intel_iommu_strict; -int iommu_pass_through; #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1)) static DEFINE_SPINLOCK(device_domain_lock); @@ -432,12 +476,14 @@ int iommu_calculate_agaw(struct intel_iommu *iommu) return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); } -/* in native case, each domain is related to only one iommu */ +/* This functionin only returns single iommu in a domain */ static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain) { int iommu_id; + /* si_domain and vm domain should not get here. */ BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE); + BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY); iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus); if (iommu_id < 0 || iommu_id >= g_num_of_iommus) @@ -633,80 +679,78 @@ static inline int width_to_agaw(int width) static inline unsigned int level_to_offset_bits(int level) { - return (12 + (level - 1) * LEVEL_STRIDE); + return (level - 1) * LEVEL_STRIDE; } -static inline int address_level_offset(u64 addr, int level) +static inline int pfn_level_offset(unsigned long pfn, int level) { - return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK); + return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; } -static inline u64 level_mask(int level) +static inline unsigned long level_mask(int level) { - return ((u64)-1 << level_to_offset_bits(level)); + return -1UL << level_to_offset_bits(level); } -static inline u64 level_size(int level) +static inline unsigned long level_size(int level) { - return ((u64)1 << level_to_offset_bits(level)); + return 1UL << level_to_offset_bits(level); } -static inline u64 align_to_level(u64 addr, int level) +static inline unsigned long align_to_level(unsigned long pfn, int level) { - return ((addr + level_size(level) - 1) & level_mask(level)); + return (pfn + level_size(level) - 1) & level_mask(level); } -static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr) +static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, + unsigned long pfn) { - int addr_width = agaw_to_width(domain->agaw); + int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; struct dma_pte *parent, *pte = NULL; int level = agaw_to_level(domain->agaw); int offset; - unsigned long flags; BUG_ON(!domain->pgd); - - addr &= (((u64)1) << addr_width) - 1; + BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width); parent = domain->pgd; - spin_lock_irqsave(&domain->mapping_lock, flags); while (level > 0) { void *tmp_page; - offset = address_level_offset(addr, level); + offset = pfn_level_offset(pfn, level); pte = &parent[offset]; if (level == 1) break; if (!dma_pte_present(pte)) { + uint64_t pteval; + tmp_page = alloc_pgtable_page(); - if (!tmp_page) { - spin_unlock_irqrestore(&domain->mapping_lock, - flags); + if (!tmp_page) return NULL; + + domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); + pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; + if (cmpxchg(&pte->val, 0ULL, pteval)) { + /* Someone else set it while we were thinking; use theirs. */ + free_pgtable_page(tmp_page); + } else { + dma_pte_addr(pte); + domain_flush_cache(domain, pte, sizeof(*pte)); } - domain_flush_cache(domain, tmp_page, PAGE_SIZE); - dma_set_pte_addr(pte, virt_to_phys(tmp_page)); - /* - * high level table always sets r/w, last level page - * table control read/write - */ - dma_set_pte_readable(pte); - dma_set_pte_writable(pte); - domain_flush_cache(domain, pte, sizeof(*pte)); } parent = phys_to_virt(dma_pte_addr(pte)); level--; } - spin_unlock_irqrestore(&domain->mapping_lock, flags); return pte; } /* return address's pte at specific level */ -static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr, - int level) +static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, + unsigned long pfn, + int level) { struct dma_pte *parent, *pte = NULL; int total = agaw_to_level(domain->agaw); @@ -714,7 +758,7 @@ static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr, parent = domain->pgd; while (level <= total) { - offset = address_level_offset(addr, total); + offset = pfn_level_offset(pfn, total); pte = &parent[offset]; if (level == total) return pte; @@ -727,74 +771,85 @@ static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr, return NULL; } -/* clear one page's page table */ -static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr) -{ - struct dma_pte *pte = NULL; - - /* get last level pte */ - pte = dma_addr_level_pte(domain, addr, 1); - - if (pte) { - dma_clear_pte(pte); - domain_flush_cache(domain, pte, sizeof(*pte)); - } -} - /* clear last level pte, a tlb flush should be followed */ -static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end) -{ - int addr_width = agaw_to_width(domain->agaw); - int npages; +static void dma_pte_clear_range(struct dmar_domain *domain, + unsigned long start_pfn, + unsigned long last_pfn) +{ + int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; + struct dma_pte *first_pte, *pte; + + BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width); + BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width); + BUG_ON(start_pfn > last_pfn); + + /* we don't need lock here; nobody else touches the iova range */ + do { + first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1); + if (!pte) { + start_pfn = align_to_level(start_pfn + 1, 2); + continue; + } + do { + dma_clear_pte(pte); + start_pfn++; + pte++; + } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); - start &= (((u64)1) << addr_width) - 1; - end &= (((u64)1) << addr_width) - 1; - /* in case it's partial page */ - start &= PAGE_MASK; - end = PAGE_ALIGN(end); - npages = (end - start) / VTD_PAGE_SIZE; + domain_flush_cache(domain, first_pte, + (void *)pte - (void *)first_pte); - /* we don't need lock here, nobody else touches the iova range */ - while (npages--) { - dma_pte_clear_one(domain, start); - start += VTD_PAGE_SIZE; - } + } while (start_pfn && start_pfn <= last_pfn); } /* free page table pages. last level pte should already be cleared */ static void dma_pte_free_pagetable(struct dmar_domain *domain, - u64 start, u64 end) + unsigned long start_pfn, + unsigned long last_pfn) { - int addr_width = agaw_to_width(domain->agaw); - struct dma_pte *pte; + int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; + struct dma_pte *first_pte, *pte; int total = agaw_to_level(domain->agaw); int level; - u64 tmp; + unsigned long tmp; - start &= (((u64)1) << addr_width) - 1; - end &= (((u64)1) << addr_width) - 1; + BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width); + BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width); + BUG_ON(start_pfn > last_pfn); - /* we don't need lock here, nobody else touches the iova range */ + /* We don't need lock here; nobody else touches the iova range */ level = 2; while (level <= total) { - tmp = align_to_level(start, level); - if (tmp >= end || (tmp + level_size(level) > end)) + tmp = align_to_level(start_pfn, level); + + /* If we can't even clear one PTE at this level, we're done */ + if (tmp + level_size(level) - 1 > last_pfn) return; - while (tmp < end) { - pte = dma_addr_level_pte(domain, tmp, level); - if (pte) { - free_pgtable_page( - phys_to_virt(dma_pte_addr(pte))); - dma_clear_pte(pte); - domain_flush_cache(domain, pte, sizeof(*pte)); + do { + first_pte = pte = dma_pfn_level_pte(domain, tmp, level); + if (!pte) { + tmp = align_to_level(tmp + 1, level + 1); + continue; } - tmp += level_size(level); - } + do { + if (dma_pte_present(pte)) { + free_pgtable_page(phys_to_virt(dma_pte_addr(pte))); + dma_clear_pte(pte); + } + pte++; + tmp += level_size(level); + } while (!first_pte_in_page(pte) && + tmp + level_size(level) - 1 <= last_pfn); + + domain_flush_cache(domain, first_pte, + (void *)pte - (void *)first_pte); + + } while (tmp && tmp + level_size(level) - 1 <= last_pfn); level++; } /* free pgd */ - if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) { + if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { free_pgtable_page(domain->pgd); domain->pgd = NULL; } @@ -834,7 +889,7 @@ static void iommu_set_root_entry(struct intel_iommu *iommu) /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, - readl, (sts & DMA_GSTS_RTPS), sts); + readl, (sts & DMA_GSTS_RTPS), sts); spin_unlock_irqrestore(&iommu->register_lock, flag); } @@ -893,7 +948,7 @@ static void __iommu_flush_context(struct intel_iommu *iommu, /* return value determine if we need a write buffer flush */ static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, - u64 addr, unsigned int size_order, u64 type) + u64 addr, unsigned int size_order, u64 type) { int tlb_offset = ecap_iotlb_offset(iommu->ecap); u64 val = 0, val_iva = 0; @@ -941,7 +996,7 @@ static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, /* check IOTLB invalidation granularity */ if (DMA_TLB_IAIG(val) == 0) - printk(KERN_ERR"IOMMU: flush IOTLB failed\n"); + printk(KERN_ERR "IOMMU: flush IOTLB failed\n"); if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n", (unsigned long long)DMA_TLB_IIRG(type), @@ -1020,14 +1075,14 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain, } static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, - u64 addr, unsigned int pages) + unsigned long pfn, unsigned int pages) { unsigned int mask = ilog2(__roundup_pow_of_two(pages)); + uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; - BUG_ON(addr & (~VTD_PAGE_MASK)); BUG_ON(pages == 0); - /* + /* * Fallback to domain selective flush if no PSI support or the size is * too big. * PSI requires page size to be 2 ^ x, and the base address is naturally @@ -1035,12 +1090,16 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, */ if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap)) iommu->flush.flush_iotlb(iommu, did, 0, 0, - DMA_TLB_DSI_FLUSH); + DMA_TLB_DSI_FLUSH); else iommu->flush.flush_iotlb(iommu, did, addr, mask, - DMA_TLB_DSI_FLUSH); + DMA_TLB_PSI_FLUSH); - if (did) + /* + * In caching mode, domain ID 0 is reserved for non-present to present + * mapping flush. Device IOTLB doesn't need to be flushed in this case. + */ + if (!cap_caching_mode(iommu->cap) || did) iommu_flush_dev_iotlb(iommu->domains[did], addr, mask); } @@ -1068,7 +1127,7 @@ static int iommu_enable_translation(struct intel_iommu *iommu) spin_lock_irqsave(&iommu->register_lock, flags); iommu->gcmd |= DMA_GCMD_TE; - writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG); + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, @@ -1105,6 +1164,8 @@ static int iommu_init_domains(struct intel_iommu *iommu) pr_debug("Number of Domains supportd <%ld>\n", ndomains); nlongs = BITS_TO_LONGS(ndomains); + spin_lock_init(&iommu->lock); + /* TBD: there might be 64K domains, * consider other allocation for future chip */ @@ -1117,12 +1178,9 @@ static int iommu_init_domains(struct intel_iommu *iommu) GFP_KERNEL); if (!iommu->domains) { printk(KERN_ERR "Allocating domain array failed\n"); - kfree(iommu->domain_ids); return -ENOMEM; } - spin_lock_init(&iommu->lock); - /* * if Caching mode is set, then invalid translations are tagged * with domainid 0. Hence we need to pre-allocate it. @@ -1142,22 +1200,24 @@ void free_dmar_iommu(struct intel_iommu *iommu) int i; unsigned long flags; - i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap)); - for (; i < cap_ndoms(iommu->cap); ) { - domain = iommu->domains[i]; - clear_bit(i, iommu->domain_ids); + if ((iommu->domains) && (iommu->domain_ids)) { + i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap)); + for (; i < cap_ndoms(iommu->cap); ) { + domain = iommu->domains[i]; + clear_bit(i, iommu->domain_ids); + + spin_lock_irqsave(&domain->iommu_lock, flags); + if (--domain->iommu_count == 0) { + if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) + vm_domain_exit(domain); + else + domain_exit(domain); + } + spin_unlock_irqrestore(&domain->iommu_lock, flags); - spin_lock_irqsave(&domain->iommu_lock, flags); - if (--domain->iommu_count == 0) { - if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) - vm_domain_exit(domain); - else - domain_exit(domain); + i = find_next_bit(iommu->domain_ids, + cap_ndoms(iommu->cap), i+1); } - spin_unlock_irqrestore(&domain->iommu_lock, flags); - - i = find_next_bit(iommu->domain_ids, - cap_ndoms(iommu->cap), i+1); } if (iommu->gcmd & DMA_GCMD_TE) @@ -1188,53 +1248,75 @@ void free_dmar_iommu(struct intel_iommu *iommu) free_context_table(iommu); } -static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu) +static struct dmar_domain *alloc_domain(void) { - unsigned long num; - unsigned long ndomains; struct dmar_domain *domain; - unsigned long flags; domain = alloc_domain_mem(); if (!domain) return NULL; + memset(&domain->iommu_bmp, 0, sizeof(unsigned long)); + domain->flags = 0; + + return domain; +} + +static int iommu_attach_domain(struct dmar_domain *domain, + struct intel_iommu *iommu) +{ + int num; + unsigned long ndomains; + unsigned long flags; + ndomains = cap_ndoms(iommu->cap); spin_lock_irqsave(&iommu->lock, flags); + num = find_first_zero_bit(iommu->domain_ids, ndomains); if (num >= ndomains) { spin_unlock_irqrestore(&iommu->lock, flags); - free_domain_mem(domain); printk(KERN_ERR "IOMMU: no free domain ids\n"); - return NULL; + return -ENOMEM; } - set_bit(num, iommu->domain_ids); domain->id = num; - memset(&domain->iommu_bmp, 0, sizeof(unsigned long)); + set_bit(num, iommu->domain_ids); set_bit(iommu->seq_id, &domain->iommu_bmp); - domain->flags = 0; iommu->domains[num] = domain; spin_unlock_irqrestore(&iommu->lock, flags); - return domain; + return 0; } -static void iommu_free_domain(struct dmar_domain *domain) +static void iommu_detach_domain(struct dmar_domain *domain, + struct intel_iommu *iommu) { unsigned long flags; - struct intel_iommu *iommu; - - iommu = domain_get_iommu(domain); + int num, ndomains; + int found = 0; spin_lock_irqsave(&iommu->lock, flags); - clear_bit(domain->id, iommu->domain_ids); + ndomains = cap_ndoms(iommu->cap); + num = find_first_bit(iommu->domain_ids, ndomains); + for (; num < ndomains; ) { + if (iommu->domains[num] == domain) { + found = 1; + break; + } + num = find_next_bit(iommu->domain_ids, + cap_ndoms(iommu->cap), num+1); + } + + if (found) { + clear_bit(num, iommu->domain_ids); + clear_bit(iommu->seq_id, &domain->iommu_bmp); + iommu->domains[num] = NULL; + } spin_unlock_irqrestore(&iommu->lock, flags); } static struct iova_domain reserved_iova_list; -static struct lock_class_key reserved_alloc_key; static struct lock_class_key reserved_rbtree_key; static void dmar_init_reserved_ranges(void) @@ -1242,12 +1324,9 @@ static void dmar_init_reserved_ranges(void) struct pci_dev *pdev = NULL; struct iova *iova; int i; - u64 addr, size; init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN); - lockdep_set_class(&reserved_iova_list.iova_alloc_lock, - &reserved_alloc_key); lockdep_set_class(&reserved_iova_list.iova_rbtree_lock, &reserved_rbtree_key); @@ -1265,12 +1344,9 @@ static void dmar_init_reserved_ranges(void) r = &pdev->resource[i]; if (!r->flags || !(r->flags & IORESOURCE_MEM)) continue; - addr = r->start; - addr &= PAGE_MASK; - size = r->end - addr; - size = PAGE_ALIGN(size); - iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr), - IOVA_PFN(size + addr) - 1); + iova = reserve_iova(&reserved_iova_list, + IOVA_PFN(r->start), + IOVA_PFN(r->end)); if (!iova) printk(KERN_ERR "Reserve iova failed\n"); } @@ -1304,7 +1380,6 @@ static int domain_init(struct dmar_domain *domain, int guest_width) unsigned long sagaw; init_iova_domain(&domain->iovad, DMA_32BIT_PFN); - spin_lock_init(&domain->mapping_lock); spin_lock_init(&domain->iommu_lock); domain_reserve_special_ranges(domain); @@ -1349,7 +1424,8 @@ static int domain_init(struct dmar_domain *domain, int guest_width) static void domain_exit(struct dmar_domain *domain) { - u64 end; + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; /* Domain 0 is reserved, so dont process it */ if (!domain) @@ -1358,16 +1434,17 @@ static void domain_exit(struct dmar_domain *domain) domain_remove_dev_info(domain); /* destroy iovas */ put_iova_domain(&domain->iovad); - end = DOMAIN_MAX_ADDR(domain->gaw); - end = end & (~PAGE_MASK); /* clear ptes */ - dma_pte_clear_range(domain, 0, end); + dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw)); /* free page tables */ - dma_pte_free_pagetable(domain, 0, end); + dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw)); + + for_each_active_iommu(iommu, drhd) + if (test_bit(iommu->seq_id, &domain->iommu_bmp)) + iommu_detach_domain(domain, iommu); - iommu_free_domain(domain); free_domain_mem(domain); } @@ -1407,7 +1484,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment, id = domain->id; pgd = domain->pgd; - if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) { + if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE || + domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) { int found = 0; /* find an available domain id for this device in iommu */ @@ -1455,12 +1533,11 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment, translation = info ? CONTEXT_TT_DEV_IOTLB : CONTEXT_TT_MULTI_LEVEL; } - /* * In pass through mode, AW must be programmed to indicate the largest * AGAW value supported by hardware. And ASR is ignored by hardware. */ - if (unlikely(translation == CONTEXT_TT_PASS_THROUGH)) + if (unlikely(translation == CONTEXT_TT_PASS_THROUGH)) context_set_address_width(context, iommu->msagaw); else { context_set_address_root(context, virt_to_phys(pgd)); @@ -1487,7 +1564,6 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment, } else { iommu_flush_write_buffer(iommu); } - iommu_enable_dev_iotlb(info); spin_unlock_irqrestore(&iommu->lock, flags); @@ -1576,42 +1652,99 @@ static int domain_context_mapped(struct pci_dev *pdev) tmp->devfn); } -static int -domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova, - u64 hpa, size_t size, int prot) +/* Returns a number of VTD pages, but aligned to MM page size */ +static inline unsigned long aligned_nrpages(unsigned long host_addr, + size_t size) { - u64 start_pfn, end_pfn; - struct dma_pte *pte; - int index; - int addr_width = agaw_to_width(domain->agaw); + host_addr &= ~PAGE_MASK; + return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; +} - hpa &= (((u64)1) << addr_width) - 1; +static inline struct page *sg_page(struct scatterlist *sg) +{ + return sg->page; +} + +static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, + struct scatterlist *sg, unsigned long phys_pfn, + unsigned long nr_pages, int prot) +{ + struct dma_pte *first_pte = NULL, *pte = NULL; + phys_addr_t uninitialized_var(pteval); + int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; + unsigned long sg_res; + + BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width); if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) return -EINVAL; - iova &= PAGE_MASK; - start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT; - end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT; - index = 0; - while (start_pfn < end_pfn) { - pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index); - if (!pte) - return -ENOMEM; + + prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP; + + if (sg) + sg_res = 0; + else { + sg_res = nr_pages + 1; + pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot; + } + + while (nr_pages--) { + uint64_t tmp; + + if (!sg_res) { + sg_res = aligned_nrpages(sg->offset, sg->length); + sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset; + sg->dma_length = sg->length; + pteval = page_to_phys(sg_page(sg)) | prot; + } + if (!pte) { + first_pte = pte = pfn_to_dma_pte(domain, iov_pfn); + if (!pte) + return -ENOMEM; + } /* We don't need lock here, nobody else * touches the iova range */ - BUG_ON(dma_pte_addr(pte)); - dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT); - dma_set_pte_prot(pte, prot); - if (prot & DMA_PTE_SNP) - dma_set_pte_snp(pte); - domain_flush_cache(domain, pte, sizeof(*pte)); - start_pfn++; - index++; + tmp = cmpxchg(&pte->val, 0ULL, pteval); + if (tmp) { + static int dumps = 5; + printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", + iov_pfn, tmp, (unsigned long long)pteval); + if (dumps) { + dumps--; + debug_dma_dump_mappings(NULL); + } + WARN_ON_ONCE(1); + } + pte++; + if (!nr_pages || first_pte_in_page(pte)) { + domain_flush_cache(domain, first_pte, + (void *)pte - (void *)first_pte); + pte = NULL; + } + iov_pfn++; + pteval += VTD_PAGE_SIZE; + sg_res--; + if (!sg_res) + sg++; } return 0; } +static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn, + struct scatterlist *sg, unsigned long nr_pages, + int prot) +{ + return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot); +} + +static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn, + unsigned long phys_pfn, unsigned long nr_pages, + int prot) +{ + return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot); +} + static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn) { if (!iommu) @@ -1676,6 +1809,7 @@ static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw) unsigned long flags; int bus = 0, devfn = 0; int segment; + int ret; domain = find_domain(pdev); if (domain) @@ -1708,6 +1842,10 @@ static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw) } } + domain = alloc_domain(); + if (!domain) + goto error; + /* Allocate new domain for the device */ drhd = dmar_find_matched_drhd_unit(pdev); if (!drhd) { @@ -1717,9 +1855,11 @@ static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw) } iommu = drhd->iommu; - domain = iommu_alloc_domain(iommu); - if (!domain) + ret = iommu_attach_domain(domain, iommu); + if (ret) { + domain_exit(domain); goto error; + } if (domain_init(domain, gaw)) { domain_exit(domain); @@ -1793,55 +1933,85 @@ error: return find_domain(pdev); } +static int iommu_identity_mapping; + +static int iommu_domain_identity_map(struct dmar_domain *domain, + unsigned long long start, + unsigned long long end) +{ + unsigned long first_vpfn = start >> VTD_PAGE_SHIFT; + unsigned long last_vpfn = end >> VTD_PAGE_SHIFT; + + if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn), + dma_to_mm_pfn(last_vpfn))) { + printk(KERN_ERR "IOMMU: reserve iova failed\n"); + return -ENOMEM; + } + + pr_debug("Mapping reserved region %llx-%llx for domain %d\n", + start, end, domain->id); + /* + * RMRR range might have overlap with physical memory range, + * clear it first + */ + dma_pte_clear_range(domain, first_vpfn, last_vpfn); + + return domain_pfn_mapping(domain, first_vpfn, first_vpfn, + last_vpfn - first_vpfn + 1, + DMA_PTE_READ|DMA_PTE_WRITE); +} + static int iommu_prepare_identity_map(struct pci_dev *pdev, unsigned long long start, unsigned long long end) { struct dmar_domain *domain; - unsigned long size; - unsigned long long base; int ret; - printk(KERN_INFO - "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n", - pci_name(pdev), start, end); - /* page table init */ domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH); if (!domain) return -ENOMEM; - /* The address might not be aligned */ - base = start & PAGE_MASK; - size = end - base; - size = PAGE_ALIGN(size); - if (!reserve_iova(&domain->iovad, IOVA_PFN(base), - IOVA_PFN(base + size) - 1)) { - printk(KERN_ERR "IOMMU: reserve iova failed\n"); - ret = -ENOMEM; - goto error; + /* For _hardware_ passthrough, don't bother. But for software + passthrough, we do it anyway -- it may indicate a memory + range which is reserved in E820, so which didn't get set + up to start with in si_domain */ + if (domain == si_domain && hw_pass_through) { + printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n", + pci_name(pdev), start, end); + return 0; } - pr_debug("Mapping reserved region %lx@%llx for %s\n", - size, base, pci_name(pdev)); - /* - * RMRR range might have overlap with physical memory range, - * clear it first - */ - dma_pte_clear_range(domain, base, base + size); + printk(KERN_INFO + "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n", + pci_name(pdev), start, end); + + if (end >> agaw_to_width(domain->agaw)) { + printk(KERN_WARNING "IOMMU: Your BIOS is broken;" + "RMRR exceeds permitted address width (%d bits)\n" + "BIOS vendor: %s; Ver: %s; Product Version: %s\n", + agaw_to_width(domain->agaw), + dmi_get_system_info(DMI_BIOS_VENDOR), + dmi_get_system_info(DMI_BIOS_VERSION), + dmi_get_system_info(DMI_PRODUCT_VERSION)); + ret = -EIO; + goto error; + } - ret = domain_page_mapping(domain, base, base, size, - DMA_PTE_READ|DMA_PTE_WRITE); + ret = iommu_domain_identity_map(domain, start, end); if (ret) goto error; /* context entry init */ ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL); - if (!ret) - return 0; -error: + if (ret) + goto error; + + return 0; + + error: domain_exit(domain); return ret; - } static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr, @@ -1853,124 +2023,218 @@ static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr, rmrr->end_address + 1); } -#ifdef CONFIG_DMAR_GFX_WA -struct iommu_prepare_data { +#ifdef CONFIG_DMAR_FLOPPY_WA +static inline void iommu_prepare_isa(void) +{ struct pci_dev *pdev; int ret; -}; -static int __init iommu_prepare_work_fn(unsigned long start_pfn, - unsigned long end_pfn, void *datax) -{ - struct iommu_prepare_data *data; + pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL); + if (!pdev) + return; - data = (struct iommu_prepare_data *)datax; + printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n"); + ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024); - data->ret = iommu_prepare_identity_map(data->pdev, - start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); - return data->ret; + if (ret) + printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; " + "floppy might not work\n"); } +#else +static inline void iommu_prepare_isa(void) +{ + return; +} +#endif /* !CONFIG_DMAR_FLPY_WA */ -static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev) +static int md_domain_init(struct dmar_domain *domain, int guest_width); + +static int __init si_domain_work_fn(unsigned long start_pfn, + unsigned long end_pfn) { - struct iommu_prepare_data data; - struct zone *zone; + return iommu_domain_identity_map(si_domain, + (uint64_t)start_pfn << PAGE_SHIFT, + (uint64_t)end_pfn << PAGE_SHIFT); +} - data.pdev = pdev; - data.ret = 0; +static int __init iommu_prepare_with_active_regions(void) +{ + struct zone *zone; + int ret; - for_each_zone(zone) { - unsigned long start_pfn, end_pfn; + for_each_zone(zone) { + unsigned long start_pfn, end_pfn; - start_pfn = zone->zone_start_pfn; - end_pfn = start_pfn + zone->spanned_pages; + start_pfn = zone->zone_start_pfn; + end_pfn = start_pfn + zone->spanned_pages; - iommu_prepare_work_fn(start_pfn, end_pfn, &data); - if (data.ret) - return data.ret; - } - return data.ret; + ret = si_domain_work_fn(start_pfn, end_pfn); + if (ret) + return ret; + } + return ret; } -static void __init iommu_prepare_gfx_mapping(void) +static int __init si_domain_init(int hw) { - struct pci_dev *pdev = NULL; - int ret; + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + int ret = 0; - for_each_pci_dev(pdev) { - if (pdev->iommu == DUMMY_DEVICE_DOMAIN_INFO || - !IS_GFX_DEVICE(pdev)) - continue; - printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n", - pci_name(pdev)); - ret = iommu_prepare_with_active_regions(pdev); - if (ret) - printk(KERN_ERR "IOMMU: mapping reserved region failed\n"); + si_domain = alloc_domain(); + if (!si_domain) + return -EFAULT; + + pr_debug("Identity mapping domain is domain %d\n", si_domain->id); + + for_each_active_iommu(iommu, drhd) { + ret = iommu_attach_domain(si_domain, iommu); + if (ret) { + domain_exit(si_domain); + return -EFAULT; + } } + + if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { + domain_exit(si_domain); + return -EFAULT; + } + + si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY; + + if (hw) + return 0; + + ret = iommu_prepare_with_active_regions(); + if (ret) + return ret; + + return 0; } -#else /* !CONFIG_DMAR_GFX_WA */ -static inline void iommu_prepare_gfx_mapping(void) + +static void domain_remove_one_dev_info(struct dmar_domain *domain, + struct pci_dev *pdev); +static int identity_mapping(struct pci_dev *pdev) { - return; + struct device_domain_info *info; + + if (likely(!iommu_identity_mapping)) + return 0; + + + list_for_each_entry(info, &si_domain->devices, link) + if (info->dev == pdev) + return 1; + return 0; } -#endif -#ifdef CONFIG_DMAR_FLOPPY_WA -static inline void iommu_prepare_isa(void) +static int domain_add_dev_info(struct dmar_domain *domain, + struct pci_dev *pdev, + int translation) { - struct pci_dev *pdev; + struct device_domain_info *info; + unsigned long flags; int ret; - pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL); - if (!pdev) - return; + info = alloc_devinfo_mem(); + if (!info) + return -ENOMEM; - printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n"); - ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024); + ret = domain_context_mapping(domain, pdev, translation); + if (ret) { + free_devinfo_mem(info); + return ret; + } - if (ret) - printk(KERN_ERR "IOMMU: Failed to create 0-64M identity map, " - "floppy might not work\n"); + info->segment = pci_domain_nr(pdev->bus); + info->bus = pdev->bus->number; + info->devfn = pdev->devfn; + info->dev = pdev; + info->domain = domain; + spin_lock_irqsave(&device_domain_lock, flags); + list_add(&info->link, &domain->devices); + list_add(&info->global, &device_domain_list); + pdev->iommu = info; + spin_unlock_irqrestore(&device_domain_lock, flags); + + return 0; } -#else -static inline void iommu_prepare_isa(void) + +static int iommu_should_identity_map(struct pci_dev *pdev, int startup) { - return; + if (iommu_identity_mapping == 2) + return IS_GFX_DEVICE(pdev); + + /* + * We want to start off with all devices in the 1:1 domain, and + * take them out later if we find they can't access all of memory. + * + * However, we can't do this for PCI devices behind bridges, + * because all PCI devices behind the same bridge will end up + * with the same source-id on their transactions. + * + * Practically speaking, we can't change things around for these + * devices at run-time, because we can't be sure there'll be no + * DMA transactions in flight for any of their siblings. + * + * So PCI devices (unless they're on the root bus) as well as + * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of + * the 1:1 domain, just in _case_ one of their siblings turns out + * not to be able to map all of memory. + */ + if (!pdev->is_pcie) { + if (!pci_is_root_bus(pdev->bus)) + return 0; + if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI) + return 0; + } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE) + return 0; + + /* + * At boot time, we don't yet know if devices will be 64-bit capable. + * Assume that they will -- if they turn out not to be, then we can + * take them out of the 1:1 domain later. + */ + if (!startup) + return pdev->dma_mask > DMA_BIT_MASK(32); + + return 1; } -#endif /* !CONFIG_DMAR_FLPY_WA */ -/* Initialize each context entry as pass through.*/ -static int __init init_context_pass_through(void) +static int __init iommu_prepare_static_identity_mapping(int hw) { struct pci_dev *pdev = NULL; - struct dmar_domain *domain; int ret; + ret = si_domain_init(hw); + if (ret) + return -EFAULT; + for_each_pci_dev(pdev) { - domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH); - if (!domain) { - printk(KERN_ERR - "Allocating pass-thru domain for %s failed", pci_name(pdev)); - return -ENODEV; + if (iommu_should_identity_map(pdev, 1)) { + printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n", + hw ? "hardware" : "software", pci_name(pdev)); + + ret = domain_add_dev_info(si_domain, pdev, + hw ? CONTEXT_TT_PASS_THROUGH : + CONTEXT_TT_MULTI_LEVEL); + if (ret) + return ret; } - ret = domain_context_mapping(domain, pdev, - CONTEXT_TT_PASS_THROUGH); - if (ret) - return ret; } + return 0; } -static int __init init_dmars(void) +int __init init_dmars(void) { struct dmar_drhd_unit *drhd; struct dmar_rmrr_unit *rmrr; struct pci_dev *pdev; struct intel_iommu *iommu; int i, ret; - int pass_through = 1; /* * for each drhd @@ -1998,7 +2262,6 @@ static int __init init_dmars(void) deferred_flush = kzalloc(g_num_of_iommus * sizeof(struct deferred_flush_tables), GFP_KERNEL); if (!deferred_flush) { - kfree(g_iommus); ret = -ENOMEM; goto error; } @@ -2025,14 +2288,8 @@ static int __init init_dmars(void) goto error; } if (!ecap_pass_through(iommu->ecap)) - pass_through = 0; + hw_pass_through = 0; } - if (iommu_pass_through) - if (!pass_through) { - printk(KERN_INFO - "Pass Through is not supported by hardware.\n"); - iommu_pass_through = 0; - } /* * Start from the sane iommu hardware state. @@ -2088,66 +2345,57 @@ static int __init init_dmars(void) } } -#ifdef CONFIG_INTR_REMAP - if (!intr_remapping_enabled) { - ret = enable_intr_remapping(0); - if (ret) - printk(KERN_ERR - "IOMMU: enable interrupt remapping failed\n"); - } + if (iommu_pass_through) + iommu_identity_mapping = 1; +#ifdef CONFIG_DMAR_BROKEN_GFX_WA + else + iommu_identity_mapping = 2; #endif /* - * If pass through is set and enabled, context entries of all pci - * devices are intialized by pass through translation type. + * If pass through is not set or not enabled, setup context entries for + * identity mappings for rmrr, gfx, and isa and may fall back to static + * identity mapping if iommu_identity_mapping is set. */ - if (iommu_pass_through) { - ret = init_context_pass_through(); + if (iommu_identity_mapping) { + ret = iommu_prepare_static_identity_mapping(hw_pass_through); if (ret) { - printk(KERN_ERR "IOMMU: Pass through init failed.\n"); - iommu_pass_through = 0; + printk(KERN_CRIT "Failed to setup IOMMU pass-through\n"); + goto error; } } - /* - * If pass through is not set or not enabled, setup context entries for - * identity mappings for rmrr, gfx, and isa. + * For each rmrr + * for each dev attached to rmrr + * do + * locate drhd for dev, alloc domain for dev + * allocate free domain + * allocate page table entries for rmrr + * if context not allocated for bus + * allocate and init context + * set present in root table for this bus + * init context with domain, translation etc + * endfor + * endfor */ - if (!iommu_pass_through) { - /* - * For each rmrr - * for each dev attached to rmrr - * do - * locate drhd for dev, alloc domain for dev - * allocate free domain - * allocate page table entries for rmrr - * if context not allocated for bus - * allocate and init context - * set present in root table for this bus - * init context with domain, translation etc - * endfor - * endfor - */ - for_each_rmrr_units(rmrr) { - for (i = 0; i < rmrr->devices_cnt; i++) { - pdev = rmrr->devices[i]; - /* - * some BIOS lists non-exist devices in DMAR - * table. - */ - if (!pdev) - continue; - ret = iommu_prepare_rmrr_dev(rmrr, pdev); - if (ret) - printk(KERN_ERR - "IOMMU: mapping reserved region failed\n"); - } + printk(KERN_INFO "IOMMU: Setting RMRR:\n"); + for_each_rmrr_units(rmrr) { + for (i = 0; i < rmrr->devices_cnt; i++) { + pdev = rmrr->devices[i]; + /* + * some BIOS lists non-exist devices in DMAR + * table. + */ + if (!pdev) + continue; + ret = iommu_prepare_rmrr_dev(rmrr, pdev); + if (ret) + printk(KERN_ERR + "IOMMU: mapping reserved region failed\n"); } - - iommu_prepare_gfx_mapping(); - - iommu_prepare_isa(); } + iommu_prepare_isa(); + /* * for each drhd * enable fault log @@ -2170,11 +2418,12 @@ static int __init init_dmars(void) iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); - iommu_disable_protect_mem_regions(iommu); ret = iommu_enable_translation(iommu); if (ret) goto error; + + iommu_disable_protect_mem_regions(iommu); } return 0; @@ -2189,58 +2438,39 @@ error: return ret; } -static inline u64 aligned_size(u64 host_addr, size_t size) -{ - u64 addr; - addr = (host_addr & (~PAGE_MASK)) + size; - return PAGE_ALIGN(addr); -} - -struct iova * -iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end) -{ - struct iova *piova; - - /* Make sure it's in range */ - end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end); - if (!size || (IOVA_START_ADDR + size > end)) - return NULL; - - piova = alloc_iova(&domain->iovad, - size >> PAGE_SHIFT, IOVA_PFN(end), 1); - return piova; -} - -static struct iova * -__intel_alloc_iova(struct device *dev, struct dmar_domain *domain, - size_t size, u64 dma_mask) +/* This takes a number of _MM_ pages, not VTD pages */ +static struct iova *intel_alloc_iova(struct device *dev, + struct dmar_domain *domain, + unsigned long nrpages, uint64_t dma_mask) { struct pci_dev *pdev = to_pci_dev(dev); struct iova *iova = NULL; - if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac) - iova = iommu_alloc_iova(domain, size, dma_mask); - else { + /* Restrict dma_mask to the width that the iommu can handle */ + dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask); + + if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) { /* * First try to allocate an io virtual address in - * DMA_32BIT_MASK and if that fails then try allocating + * DMA_BIT_MASK(32) and if that fails then try allocating * from higher range */ - iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK); - if (!iova) - iova = iommu_alloc_iova(domain, size, dma_mask); - } - - if (!iova) { - printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev)); + iova = alloc_iova(&domain->iovad, nrpages, + IOVA_PFN(DMA_BIT_MASK(32)), 1); + if (iova) + return iova; + } + iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1); + if (unlikely(!iova)) { + printk(KERN_ERR "Allocating %ld-page iova for %s failed", + nrpages, pci_name(pdev)); return NULL; } return iova; } -static struct dmar_domain * -get_valid_domain_for_dev(struct pci_dev *pdev) +static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev) { struct dmar_domain *domain; int ret; @@ -2268,6 +2498,75 @@ get_valid_domain_for_dev(struct pci_dev *pdev) return domain; } +static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev) +{ + struct device_domain_info *info; + + /* No lock here, assumes no domain exit in normal case */ + info = dev->iommu; + if (likely(info)) + return info->domain; + + return __get_valid_domain_for_dev(dev); +} + +static int iommu_dummy(struct pci_dev *pdev) +{ + return pdev->iommu == DUMMY_DEVICE_DOMAIN_INFO; +} + +/* Check if the pdev needs to go through non-identity map and unmap process.*/ +static int iommu_no_mapping(struct device *dev) +{ + struct pci_dev *pdev; + int found; + + if (unlikely(dev->bus != &pci_bus_type)) + return 1; + + pdev = to_pci_dev(dev); + if (iommu_dummy(pdev)) + return 1; + + if (!iommu_identity_mapping) + return 0; + + found = identity_mapping(pdev); + if (found) { + if (iommu_should_identity_map(pdev, 0)) + return 1; + else { + /* + * 32 bit DMA is removed from si_domain and fall back + * to non-identity mapping. + */ + domain_remove_one_dev_info(si_domain, pdev); + printk(KERN_INFO "32bit %s uses non-identity mapping\n", + pci_name(pdev)); + return 0; + } + } else { + /* + * In case of a detached 64 bit DMA device from vm, the device + * is put into si_domain for identity mapping. + */ + if (iommu_should_identity_map(pdev, 0)) { + int ret; + ret = domain_add_dev_info(si_domain, pdev, + hw_pass_through ? + CONTEXT_TT_PASS_THROUGH : + CONTEXT_TT_MULTI_LEVEL); + if (!ret) { + printk(KERN_INFO "64bit %s uses identity mapping\n", + pci_name(pdev)); + return 1; + } + } + } + + return 0; +} + static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir, u64 dma_mask) { @@ -2278,9 +2577,11 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, int prot = 0; int ret; struct intel_iommu *iommu; + unsigned long paddr_pfn = paddr >> PAGE_SHIFT; BUG_ON(dir == DMA_NONE); - if (pdev->iommu == DUMMY_DEVICE_DOMAIN_INFO) + + if (iommu_no_mapping(hwdev)) return paddr; domain = get_valid_domain_for_dev(pdev); @@ -2288,14 +2589,13 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, return 0; iommu = domain_get_iommu(domain); - size = aligned_size((u64)paddr, size); + size = aligned_nrpages(paddr, size); - iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask); + iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), + pdev->dma_mask); if (!iova) goto error; - start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT; - /* * Check if DMAR supports zero-length reads on write only * mappings.. @@ -2311,19 +2611,20 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, * might have two guest_addr mapping to the same host paddr, but this * is not a big problem */ - ret = domain_page_mapping(domain, start_paddr, - ((u64)paddr) & PAGE_MASK, size, prot); + ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo), + mm_to_dma_pfn(paddr_pfn), size, prot); if (ret) goto error; - /* it's a non-present to present mapping nly flush if caching mode */ + /* it's a non-present to present mapping. Only flush if caching mode */ if (cap_caching_mode(iommu->cap)) - iommu_flush_iotlb_psi(iommu, 0, start_paddr, - size >> VTD_PAGE_SHIFT); + iommu_flush_iotlb_psi(iommu, 0, mm_to_dma_pfn(iova->pfn_lo), size); else iommu_flush_write_buffer(iommu); - return start_paddr + ((u64)paddr & (~PAGE_MASK)); + start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT; + start_paddr += paddr & ~PAGE_MASK; + return start_paddr; error: if (iova) @@ -2358,8 +2659,13 @@ static void flush_unmaps(void) iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); for (j = 0; j < deferred_flush[i].next; j++) { - __free_iova(&deferred_flush[i].domain[j]->iovad, - deferred_flush[i].iova[j]); + unsigned long mask; + struct iova *iova = deferred_flush[i].iova[j]; + + mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1)); + iommu_flush_dev_iotlb(deferred_flush[i].domain[j], + (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask); + __free_iova(&deferred_flush[i].domain[j]->iovad, iova); } deferred_flush[i].next = 0; } @@ -2402,39 +2708,48 @@ static void add_unmap(struct dmar_domain *dom, struct iova *iova) spin_unlock_irqrestore(&async_umap_flush_lock, flags); } +static int warn_once_unmap_single; static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size, int dir) { struct pci_dev *pdev = to_pci_dev(dev); struct dmar_domain *domain; - unsigned long start_addr; + unsigned long start_pfn, last_pfn; struct iova *iova; struct intel_iommu *iommu; - if (pdev->iommu == DUMMY_DEVICE_DOMAIN_INFO) + if (iommu_no_mapping(dev)) return; + domain = find_domain(pdev); BUG_ON(!domain); iommu = domain_get_iommu(domain); iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr)); - if (!iova) + if (!iova) { + if(!warn_once_unmap_single) + printk(KERN_WARNING "IOMMU: " + "Driver unmaps unmatched page at PFN %llx\n", + (unsigned long long)dev_addr); return; + } - start_addr = iova->pfn_lo << PAGE_SHIFT; - size = aligned_size((u64)dev_addr, size); + start_pfn = mm_to_dma_pfn(iova->pfn_lo); + last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1; - pr_debug("Device %s unmapping: %lx@%llx\n", - pci_name(pdev), size, (unsigned long long)start_addr); + pr_debug("Device %s unmapping: pfn %lx-%lx\n", + pci_name(pdev), start_pfn, last_pfn); /* clear the whole page */ - dma_pte_clear_range(domain, start_addr, start_addr + size); + dma_pte_clear_range(domain, start_pfn, last_pfn); + /* free page tables */ - dma_pte_free_pagetable(domain, start_addr, start_addr + size); + dma_pte_free_pagetable(domain, start_pfn, last_pfn); + if (intel_iommu_strict) { - iommu_flush_iotlb_psi(iommu, domain->id, start_addr, - size >> VTD_PAGE_SHIFT); + iommu_flush_iotlb_psi(iommu, domain->id, start_pfn, + last_pfn - start_pfn + 1); /* free iova */ __free_iova(&domain->iovad, iova); } else { @@ -2447,7 +2762,7 @@ static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t s } static void *intel_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags) + dma_addr_t *dma_handle, gfp_t flags) { void *vaddr; int order; @@ -2482,25 +2797,17 @@ static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, order); } -static inline struct page *sg_page(struct scatterlist *sg) -{ - return sg->page; -} - +static int warn_once_unmap_sg; static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, int nelems, int dir) { - int i; struct pci_dev *pdev = to_pci_dev(hwdev); struct dmar_domain *domain; - unsigned long start_addr; + unsigned long start_pfn, last_pfn; struct iova *iova; - size_t size = 0; - phys_addr_t addr; - struct scatterlist *sg; struct intel_iommu *iommu; - if (pdev->iommu == DUMMY_DEVICE_DOMAIN_INFO) + if (iommu_no_mapping(hwdev)) return; domain = find_domain(pdev); @@ -2509,25 +2816,35 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, iommu = domain_get_iommu(domain); iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address)); - if (!iova) + if (!iova) { + if(!warn_once_unmap_sg) + printk(KERN_WARNING "IOMMU: " + "Driver unmaps unmatched sglist at PFN %llx\n", + (unsigned long long)sglist[0].dma_address); return; - for_each_sg(sglist, sg, nelems, i) { - addr = page_to_phys(sg_page(sg)) + sg->offset; - size += aligned_size((u64)addr, sg->length); } - start_addr = iova->pfn_lo << PAGE_SHIFT; + start_pfn = mm_to_dma_pfn(iova->pfn_lo); + last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1; /* clear the whole page */ - dma_pte_clear_range(domain, start_addr, start_addr + size); - /* free page tables */ - dma_pte_free_pagetable(domain, start_addr, start_addr + size); + dma_pte_clear_range(domain, start_pfn, last_pfn); - iommu_flush_iotlb_psi(iommu, domain->id, start_addr, - size >> VTD_PAGE_SHIFT); + /* free page tables */ + dma_pte_free_pagetable(domain, start_pfn, last_pfn); - /* free iova */ - __free_iova(&domain->iovad, iova); + if (intel_iommu_strict) { + iommu_flush_iotlb_psi(iommu, domain->id, start_pfn, + last_pfn - start_pfn + 1); + /* free iova */ + __free_iova(&domain->iovad, iova); + } else { + add_unmap(domain, iova); + /* + * queue up the release of the unmap to save the 1/6th of the + * cpu used up by the iotlb flush operation... + */ + } } static int intel_nontranslate_map_sg(struct device *hddev, @@ -2547,21 +2864,20 @@ static int intel_nontranslate_map_sg(struct device *hddev, static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems, int dir) { - phys_addr_t addr; int i; struct pci_dev *pdev = to_pci_dev(hwdev); struct dmar_domain *domain; size_t size = 0; int prot = 0; - size_t offset = 0; + size_t offset_pfn = 0; struct iova *iova = NULL; int ret; struct scatterlist *sg; - unsigned long start_addr; + unsigned long start_vpfn; struct intel_iommu *iommu; BUG_ON(dir == DMA_NONE); - if (pdev->iommu == DUMMY_DEVICE_DOMAIN_INFO) + if (iommu_no_mapping(hwdev)) return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir); domain = get_valid_domain_for_dev(pdev); @@ -2570,12 +2886,11 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne iommu = domain_get_iommu(domain); - for_each_sg(sglist, sg, nelems, i) { - addr = page_to_phys(sg_page(sg)) + sg->offset; - size += aligned_size((u64)addr, sg->length); - } + for_each_sg(sglist, sg, nelems, i) + size += aligned_nrpages(sg->offset, sg->length); - iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask); + iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), + pdev->dma_mask); if (!iova) { sglist->dma_length = 0; return 0; @@ -2591,35 +2906,24 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) prot |= DMA_PTE_WRITE; - start_addr = iova->pfn_lo << PAGE_SHIFT; - offset = 0; - for_each_sg(sglist, sg, nelems, i) { - addr = page_to_phys(sg_page(sg)) + sg->offset; - size = aligned_size((u64)addr, sg->length); - ret = domain_page_mapping(domain, start_addr + offset, - ((u64)addr) & PAGE_MASK, - size, prot); - if (ret) { - /* clear the page */ - dma_pte_clear_range(domain, start_addr, - start_addr + offset); - /* free page tables */ - dma_pte_free_pagetable(domain, start_addr, - start_addr + offset); - /* free iova */ - __free_iova(&domain->iovad, iova); - return 0; - } - sg->dma_address = start_addr + offset + - ((u64)addr & (~PAGE_MASK)); - sg->dma_length = sg->length; - offset += size; + start_vpfn = mm_to_dma_pfn(iova->pfn_lo); + + ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot); + if (unlikely(ret)) { + /* clear the page */ + dma_pte_clear_range(domain, start_vpfn, + start_vpfn + size - 1); + /* free page tables */ + dma_pte_free_pagetable(domain, start_vpfn, + start_vpfn + size - 1); + /* free iova */ + __free_iova(&domain->iovad, iova); + return 0; } /* it's a non-present to present mapping. Only flush if caching mode */ if (cap_caching_mode(iommu->cap)) - iommu_flush_iotlb_psi(iommu, 0, start_addr, - offset >> VTD_PAGE_SHIFT); + iommu_flush_iotlb_psi(iommu, 0, start_vpfn, offset_pfn); else iommu_flush_write_buffer(iommu); @@ -2777,8 +3081,8 @@ static int init_iommu_hw(void) DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); - iommu_disable_protect_mem_regions(iommu); iommu_enable_translation(iommu); + iommu_disable_protect_mem_regions(iommu); } return 0; @@ -2844,7 +3148,7 @@ static int iommu_resume(struct sys_device *dev) unsigned long flag; if (init_iommu_hw()) { - WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); + printk(KERN_WARNING "IOMMU setup failed, DMAR can not resume!\n"); return -EIO; } @@ -2916,7 +3220,7 @@ int __init intel_iommu_init(void) * Check the need for DMA-remapping initialization now. * Above initialization will also be used by Interrupt-remapping. */ - if (no_iommu || (swiotlb && !iommu_pass_through) || dmar_disabled) + if (no_iommu || swiotlb || dmar_disabled) return -ENODEV; iommu_init_mempool(); @@ -2936,14 +3240,7 @@ int __init intel_iommu_init(void) init_timer(&unmap_timer); force_iommu = 1; - - if (!iommu_pass_through) { - printk(KERN_INFO - "Multi-level page-table translation for DMAR.\n"); - dma_ops = &intel_dma_ops; - } else - printk(KERN_INFO - "DMAR: Pass through translation for DMAR.\n"); + dma_ops = &intel_dma_ops; init_iommu_sysfs(); @@ -2952,31 +3249,6 @@ int __init intel_iommu_init(void) return 0; } -static int vm_domain_add_dev_info(struct dmar_domain *domain, - struct pci_dev *pdev) -{ - struct device_domain_info *info; - unsigned long flags; - - info = alloc_devinfo_mem(); - if (!info) - return -ENOMEM; - - info->segment = pci_domain_nr(pdev->bus); - info->bus = pdev->bus->number; - info->devfn = pdev->devfn; - info->dev = pdev; - info->domain = domain; - - spin_lock_irqsave(&device_domain_lock, flags); - list_add(&info->link, &domain->devices); - list_add(&info->global, &device_domain_list); - pdev->iommu = info; - spin_unlock_irqrestore(&device_domain_lock, flags); - - return 0; -} - static void iommu_detach_dependent_devices(struct intel_iommu *iommu, struct pci_dev *pdev) { @@ -3004,7 +3276,7 @@ static void iommu_detach_dependent_devices(struct intel_iommu *iommu, } } -static void vm_domain_remove_one_dev_info(struct dmar_domain *domain, +static void domain_remove_one_dev_info(struct dmar_domain *domain, struct pci_dev *pdev) { struct device_domain_info *info; @@ -3030,6 +3302,7 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain, info->dev->iommu = NULL; spin_unlock_irqrestore(&device_domain_lock, flags); + iommu_disable_dev_iotlb(info); iommu_detach_dev(iommu, info->bus, info->devfn); iommu_detach_dependent_devices(iommu, pdev); free_devinfo_mem(info); @@ -3080,6 +3353,7 @@ static void vm_domain_remove_all_dev_info(struct dmar_domain *domain) spin_unlock_irqrestore(&device_domain_lock, flags1); + iommu_disable_dev_iotlb(info); iommu = device_to_iommu(info->segment, info->bus, info->devfn); iommu_detach_dev(iommu, info->bus, info->devfn); iommu_detach_dependent_devices(iommu, info->dev); @@ -3135,12 +3409,11 @@ static struct dmar_domain *iommu_alloc_vm_domain(void) return domain; } -static int vm_domain_init(struct dmar_domain *domain, int guest_width) +static int md_domain_init(struct dmar_domain *domain, int guest_width) { int adjust_width; init_iova_domain(&domain->iovad, DMA_32BIT_PFN); - spin_lock_init(&domain->mapping_lock); spin_lock_init(&domain->iommu_lock); domain_reserve_special_ranges(domain); @@ -3154,6 +3427,7 @@ static int vm_domain_init(struct dmar_domain *domain, int guest_width) domain->iommu_count = 0; domain->iommu_coherency = 0; + domain->iommu_snooping = 0; domain->max_addr = 0; /* always allocate the top pgd */ @@ -3194,8 +3468,6 @@ static void iommu_free_vm_domain(struct dmar_domain *domain) static void vm_domain_exit(struct dmar_domain *domain) { - u64 end; - /* Domain 0 is reserved, so dont process it */ if (!domain) return; @@ -3203,14 +3475,12 @@ static void vm_domain_exit(struct dmar_domain *domain) vm_domain_remove_all_dev_info(domain); /* destroy iovas */ put_iova_domain(&domain->iovad); - end = DOMAIN_MAX_ADDR(domain->gaw); - end = end & (~VTD_PAGE_MASK); /* clear ptes */ - dma_pte_clear_range(domain, 0, end); + dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw)); /* free page tables */ - dma_pte_free_pagetable(domain, 0, end); + dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw)); iommu_free_vm_domain(domain); free_domain_mem(domain); @@ -3226,7 +3496,7 @@ static int intel_iommu_domain_init(struct iommu_domain *domain) "intel_iommu_domain_init: dmar_domain == NULL\n"); return -ENOMEM; } - if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { + if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { printk(KERN_ERR "intel_iommu_domain_init() failed\n"); vm_domain_exit(dmar_domain); @@ -3253,7 +3523,6 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, struct intel_iommu *iommu; int addr_width; u64 end; - int ret; /* normally pdev is not mapped */ if (unlikely(domain_context_mapped(pdev))) { @@ -3261,8 +3530,9 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, old_domain = find_domain(pdev); if (old_domain) { - if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) - vm_domain_remove_one_dev_info(old_domain, pdev); + if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE || + dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) + domain_remove_one_dev_info(old_domain, pdev); else domain_remove_dev_info(old_domain); } @@ -3284,12 +3554,7 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, return -EFAULT; } - ret = domain_context_mapping(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL); - if (ret) - return ret; - - ret = vm_domain_add_dev_info(dmar_domain, pdev); - return ret; + return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL); } static void intel_iommu_detach_device(struct iommu_domain *domain, @@ -3298,7 +3563,7 @@ static void intel_iommu_detach_device(struct iommu_domain *domain, struct dmar_domain *dmar_domain = domain->priv; struct pci_dev *pdev = to_pci_dev(dev); - vm_domain_remove_one_dev_info(dmar_domain, pdev); + domain_remove_one_dev_info(dmar_domain, pdev); } static int intel_iommu_map_range(struct iommu_domain *domain, @@ -3318,7 +3583,7 @@ static int intel_iommu_map_range(struct iommu_domain *domain, if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) prot |= DMA_PTE_SNP; - max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size); + max_addr = iova + size; if (dmar_domain->max_addr < max_addr) { int min_agaw; u64 end; @@ -3336,8 +3601,11 @@ static int intel_iommu_map_range(struct iommu_domain *domain, } dmar_domain->max_addr = max_addr; } - - ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot); + /* Round up size to next multiple of PAGE_SIZE, if it and + the low bits of hpa would take us onto the next page */ + size = aligned_nrpages(hpa, size); + ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, + hpa >> VTD_PAGE_SHIFT, size, prot); return ret; } @@ -3345,15 +3613,15 @@ static void intel_iommu_unmap_range(struct iommu_domain *domain, unsigned long iova, size_t size) { struct dmar_domain *dmar_domain = domain->priv; - dma_addr_t base; - /* The address might not be aligned */ - base = iova & VTD_PAGE_MASK; - size = VTD_PAGE_ALIGN(size); - dma_pte_clear_range(dmar_domain, base, base + size); + if (!size) + return; + + dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT, + (iova + size - 1) >> VTD_PAGE_SHIFT); - if (dmar_domain->max_addr == base + size) - dmar_domain->max_addr = base; + if (dmar_domain->max_addr == iova + size) + dmar_domain->max_addr = iova; } static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, @@ -3363,7 +3631,7 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, struct dma_pte *pte; u64 phys = 0; - pte = addr_to_dma_pte(dmar_domain, iova); + pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT); if (pte) phys = dma_pte_addr(pte); @@ -3403,3 +3671,4 @@ static void __devinit quirk_iommu_rwbf(struct pci_dev *dev) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); + diff --git a/drivers/pci/iova.c b/drivers/pci/iova.c index 9c11867..4a55838 100644 --- a/drivers/pci/iova.c +++ b/drivers/pci/iova.c @@ -22,7 +22,6 @@ void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit) { - spin_lock_init(&iovad->iova_alloc_lock); spin_lock_init(&iovad->iova_rbtree_lock); iovad->rbroot = RB_ROOT; iovad->cached32_node = NULL; @@ -205,7 +204,6 @@ alloc_iova(struct iova_domain *iovad, unsigned long size, unsigned long limit_pfn, bool size_aligned) { - unsigned long flags; struct iova *new_iova; int ret; @@ -219,11 +217,9 @@ alloc_iova(struct iova_domain *iovad, unsigned long size, if (size_aligned) size = __roundup_pow_of_two(size); - spin_lock_irqsave(&iovad->iova_alloc_lock, flags); ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn, new_iova, size_aligned); - spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags); if (ret) { free_iova_mem(new_iova); return NULL; @@ -381,8 +377,7 @@ reserve_iova(struct iova_domain *iovad, struct iova *iova; unsigned int overlap = 0; - spin_lock_irqsave(&iovad->iova_alloc_lock, flags); - spin_lock(&iovad->iova_rbtree_lock); + spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) { if (__is_range_overlap(node, pfn_lo, pfn_hi)) { iova = container_of(node, struct iova, node); @@ -402,8 +397,7 @@ reserve_iova(struct iova_domain *iovad, iova = __insert_new_range(iovad, pfn_lo, pfn_hi); finish: - spin_unlock(&iovad->iova_rbtree_lock); - spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags); + spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); return iova; } @@ -420,8 +414,7 @@ copy_reserved_iova(struct iova_domain *from, struct iova_domain *to) unsigned long flags; struct rb_node *node; - spin_lock_irqsave(&from->iova_alloc_lock, flags); - spin_lock(&from->iova_rbtree_lock); + spin_lock_irqsave(&from->iova_rbtree_lock, flags); for (node = rb_first(&from->rbroot); node; node = rb_next(node)) { struct iova *iova = container_of(node, struct iova, node); struct iova *new_iova; @@ -430,6 +423,5 @@ copy_reserved_iova(struct iova_domain *from, struct iova_domain *to) printk(KERN_ERR "Reserve iova range %lx@%lx failed\n", iova->pfn_lo, iova->pfn_lo); } - spin_unlock(&from->iova_rbtree_lock); - spin_unlock_irqrestore(&from->iova_alloc_lock, flags); + spin_unlock_irqrestore(&from->iova_rbtree_lock, flags); } diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h index 5619f85..194f569 100644 --- a/include/linux/dma_remapping.h +++ b/include/linux/dma_remapping.h @@ -39,4 +39,12 @@ static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu) extern int dmar_disabled; +#ifdef CONFIG_DMA_API_DEBUG +extern void debug_dma_dump_mappings(struct device *dev); +#else +static inline void debug_dma_dump_mappings(struct device *dev) +{ +} +#endif /* CONFIG_DMA_API_DEBUG */ + #endif diff --git a/include/linux/iova.h b/include/linux/iova.h index 228f6c9..76a0759 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -28,7 +28,6 @@ struct iova { /* holds all the iova translations for a domain */ struct iova_domain { - spinlock_t iova_alloc_lock;/* Lock to protect iova allocation */ spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */ struct rb_root rbroot; /* iova domain rbtree root */ struct rb_node *cached32_node; /* Save last alloced node */