kernel-2.6.18-238.el5.src.rpm

From: Bhavna Sarathy <bnagendr@redhat.com>
Date: Thu, 5 Aug 2010 19:37:16 -0400
Subject: [misc] amd_iommu: add passthrough mode support
Message-id: <20100805194258.6739.17755.sendpatchset@localhost.localdomain>
Patchwork-id: 27425
O-Subject: [RHEL5.6 PATCH] Add AMD IOMMU passthrough mode support
Bugzilla: 561127
RH-Acked-by: Chris Wright <chrisw@redhat.com>

Resolves BZ 561127

This patch adds AMD IOMMU passthrough mode support. It underwent review with
Chris Wright and Don Dutile toward the end of RHEL5.5, and all feedback
received at that time has been incorporated. Similar support exists in RHEL6
as well.

Please note that previous versions of RHEL5, such as RHEL5.4 and RHEL5.5, can
already pass through devices to KVM (see the example after the test data
below). The iommu=pt kernel command line switch only disables use of the
IOMMU for the DMA-API.
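
For reference, enabling passthrough mode only requires appending the switch
to the kernel line in the bootloader configuration; a RHEL5-style
/boot/grub/grub.conf entry might look like this (root device illustrative):

    kernel /vmlinuz-2.6.18-238.el5 ro root=/dev/VolGroup00/LogVol00 iommu=pt

Driver code is unaffected either way; the DMA-API calls stay the same and
only the dma_ops implementation behind them changes. A minimal sketch of a
hypothetical driver fragment (2.6.18-era API, error handling trimmed):

    /* Map a buffer for device DMA; with iommu=pt this is served by the
     * swiotlb/nommu dma_ops instead of the AMD IOMMU dma_ops. */
    dma_addr_t handle = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
    if (dma_mapping_error(handle))
            return -EIO;

    /* ... device DMA to/from the buffer happens here ... */

    dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);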

This patch has been tested with the following configuration (system = dinar).
Test results:

Kernel:
    2.6.18-210.el5 + backported patch

Explicit kernel parameters:
    iommu=pt

IOMMU related kernel messages:
    AMD-Vi: Initialized for Passthrough Mode
    AMD IOMMU: Enabling IOMMU at 00:00.2 cap 0x40

Device used for passthrough:
    Intel 82598EB 10-Gigabit AT Network Connection (rev 01)
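
For completeness, handing the device above to a KVM guest happens in the KVM
userspace and is independent of iommu=pt. With the RHEL5-era qemu-kvm this
would look roughly like the line below; the option name and PCI address are
from memory and should be treated as an assumption rather than exact syntax:

    qemu-kvm -m 1024 -hda guest.img -pcidevice host=0b:00.0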

Please review and ACK.

Signed-off-by: Jarod Wilson <jarod@redhat.com>

diff --git a/arch/i386/kernel/pci-dma-xen.c b/arch/i386/kernel/pci-dma-xen.c
index 7bf796e..b7776bf 100644
--- a/arch/i386/kernel/pci-dma-xen.c
+++ b/arch/i386/kernel/pci-dma-xen.c
@@ -50,6 +50,8 @@ int force_iommu __read_mostly= 0;
 /* Set this to 1 if there is a HW IOMMU in the system */
 int iommu_detected __read_mostly = 0;
 
+int iommu_pass_through __read_mostly = 0;
+
 void __init pci_iommu_alloc(void)
 {
 	/*
diff --git a/arch/x86_64/kernel/amd_iommu.c b/arch/x86_64/kernel/amd_iommu.c
index 6e19734..59da1f0 100644
--- a/arch/x86_64/kernel/amd_iommu.c
+++ b/arch/x86_64/kernel/amd_iommu.c
@@ -40,6 +40,12 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock);
 static LIST_HEAD(iommu_pd_list);
 static DEFINE_SPINLOCK(iommu_pd_list_lock);
 
+/*
+ * Domain for untranslated devices - only allocated
+ * if iommu=pt passed on kernel cmd line.
+ */
+static struct protection_domain *pt_domain;
+
 #ifdef CONFIG_IOMMU_API
 static struct iommu_ops amd_iommu_ops;
 #endif
@@ -922,25 +928,41 @@ static struct protection_domain *domain_for_device(u16 devid)
 * If a device is not yet associated with a domain, this function
 * assigns it to the given domain and makes it visible to the hardware
  */
-static void attach_device(struct amd_iommu *iommu,
-			  struct protection_domain *domain,
-			  u16 devid)
+static void __attach_device(struct amd_iommu *iommu,
+			    struct protection_domain *domain,
+			    u16 devid)
 {
-	unsigned long flags;
-	u64 pte_root = virt_to_phys(domain->pt_root);
+	u64 pte_root;
 
-	domain->dev_cnt += 1;
+	/* lock domain */
+	spin_lock(&domain->lock);
+
+	pte_root = virt_to_phys(domain->pt_root);
 
 	pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
-		    << DEV_ENTRY_MODE_SHIFT;
+		<< DEV_ENTRY_MODE_SHIFT;
 	pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
 
-	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-	amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
-	amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
 	amd_iommu_dev_table[devid].data[2] = domain->id;
+	amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
+	amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
 
 	amd_iommu_pd_table[devid] = domain;
+
+	domain->dev_cnt += 1;
+
+	/* ready */
+	spin_unlock(&domain->lock);
+}
+
+static void attach_device(struct amd_iommu *iommu,
+			  struct protection_domain *domain,
+			  u16 devid)
+{
+	unsigned long flags;
+
+	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+	__attach_device(iommu, domain, devid);
 	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
 
 	/*
@@ -957,6 +979,7 @@ static void attach_device(struct amd_iommu *iommu,
  */
 static void __detach_device(struct protection_domain *domain, u16 devid)
 {
+	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
 
 	/* lock domain */
 	spin_lock(&domain->lock);
@@ -976,6 +999,19 @@ static void __detach_device(struct protection_domain *domain, u16 devid)
 
 	/* ready */
 	spin_unlock(&domain->lock);
+
+	/*
+	 * If we run in passthrough mode the device must be assigned to the
+	 * passthrough domain if it is detached from any other domain.
+	 * Make sure we can deassign from the pt_domain itself.
+	 */
+	if (iommu_pass_through && domain != pt_domain)
+		__attach_device(iommu, pt_domain, devid);
+
+	iommu_queue_inv_dev_entry(iommu, devid);
+	if (domain->dev_cnt == 0)
+		iommu_flush_tlb_pde(iommu, domain->id);
+	iommu_completion_wait(iommu);
 }
 
 /*
@@ -1014,10 +1050,6 @@ static int device_change_notifier(struct notifier_block *nb,
 
 	domain = domain_for_device(devid);
 
-	if (domain && !dma_ops_domain(domain))
-		printk(KERN_ERR "AMD IOMMU WARNING: device already bound "
-		       "to a non-dma-ops domain\n");
-
 	switch (action) {
 	case BUS_NOTIFY_BOUND_DRIVER:
 		if (domain)
@@ -1033,6 +1065,8 @@ static int device_change_notifier(struct notifier_block *nb,
 	case BUS_NOTIFY_UNBIND_DRIVER:
 		if (!domain)
 			goto out;
+		if (iommu_pass_through)
+			break;
 		detach_device(domain, devid);
 		break;
 	case BUS_NOTIFY_ADD_DEVICE:
@@ -1065,6 +1099,11 @@ struct notifier_block device_nb = {
 	.notifier_call = device_change_notifier,
 };
 
+void amd_iommu_init_notifier(void)
+{
+	bus_register_notifier(&pci_bus_type, &device_nb);
+}
+
 /*****************************************************************************
  *
  * The next functions belong to the dma_ops mapping/unmapping code.
@@ -1703,6 +1742,11 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
 	.dma_supported = amd_iommu_dma_supported,
 };
 
+void __init amd_iommu_init_api(void)
+{
+	register_iommu(&amd_iommu_ops);
+}
+
 /*
  * The function which clues the AMD IOMMU driver into dma_ops.
  */
@@ -1748,10 +1792,6 @@ int __init amd_iommu_init_dma_ops(void)
 	/* Make the driver finally visible to the drivers */
 	dma_ops = &amd_iommu_dma_ops;
 
-	register_iommu(&amd_iommu_ops);
-
-	bus_register_notifier(&pci_bus_type, &device_nb);
-
 	amd_iommu_stats_init();
 
 	return 0;
@@ -1790,19 +1830,47 @@ static void cleanup_domain(struct protection_domain *domain)
 	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
 }
 
-static int amd_iommu_domain_init(struct iommu_domain *dom)
+static void protection_domain_free(struct protection_domain *domain)
+{
+	if (!domain)
+		return;
+
+	if (domain->id)
+		domain_id_free(domain->id);
+
+	kfree(domain);
+}
+
+static struct protection_domain *protection_domain_alloc(void)
 {
 	struct protection_domain *domain;
 
 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
 	if (!domain)
-		return -ENOMEM;
+		return NULL;
 
 	spin_lock_init(&domain->lock);
-	domain->mode = PAGE_MODE_3_LEVEL;
 	domain->id = domain_id_alloc();
 	if (!domain->id)
+		goto out_err;
+
+	return domain;
+
+out_err:
+	kfree(domain);
+
+	return NULL;
+}
+
+static int amd_iommu_domain_init(struct iommu_domain *dom)
+{
+	struct protection_domain *domain;
+
+	domain = protection_domain_alloc();
+	if (!domain)
 		goto out_free;
+
+	domain->mode    = PAGE_MODE_3_LEVEL;
 	domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
 	if (!domain->pt_root)
 		goto out_free;
@@ -1812,7 +1880,7 @@ static int amd_iommu_domain_init(struct iommu_domain *dom)
 	return 0;
 
 out_free:
-	kfree(domain);
+	protection_domain_free(domain);
 
 	return -ENOMEM;
 }
@@ -1986,3 +2054,48 @@ static struct iommu_ops amd_iommu_ops = {
 	.unmap = amd_iommu_unmap_range,
 	.iova_to_phys = amd_iommu_iova_to_phys,
 };
+
+/*****************************************************************************
+ *
+ * The next functions do a basic initialization of IOMMU for pass through
+ * mode
+ *
+ * In passthrough mode the IOMMU is initialized and enabled but not used for
+ * DMA-API translation.
+ *
+ *****************************************************************************/
+
+int __init amd_iommu_init_passthrough(void)
+{
+	struct pci_dev *dev = NULL;
+	u16 devid, devid2;
+
+	/* allocate passthrough domain */
+	pt_domain = protection_domain_alloc();
+	if (!pt_domain)
+		return -ENOMEM;
+
+	pt_domain->mode |= PAGE_MODE_NONE;
+
+	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+		struct amd_iommu *iommu;
+
+		devid = calc_devid(dev->bus->number, dev->devfn);
+		if (devid > amd_iommu_last_bdf)
+			continue;
+
+		devid2 = amd_iommu_alias_table[devid];
+
+		iommu = amd_iommu_rlookup_table[devid2];
+		if (!iommu)
+			continue;
+
+		__attach_device(iommu, pt_domain, devid);
+		__attach_device(iommu, pt_domain, devid2);
+	}
+
+	pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
+
+	return 0;
+}
+
diff --git a/arch/x86_64/kernel/amd_iommu_init.c b/arch/x86_64/kernel/amd_iommu_init.c
index 595ac5c..fb40704 100644
--- a/arch/x86_64/kernel/amd_iommu_init.c
+++ b/arch/x86_64/kernel/amd_iommu_init.c
@@ -1110,12 +1110,23 @@ int __init amd_iommu_init(void)
 	if (ret)
 		goto free;
 
-	ret = amd_iommu_init_dma_ops();
+	if (iommu_pass_through)
+		ret = amd_iommu_init_passthrough();
+	else
+		ret = amd_iommu_init_dma_ops();
+
 	if (ret)
 		goto free;
 
+	amd_iommu_init_api();
+
+	amd_iommu_init_notifier();
+
 	enable_iommus();
 
+	if (iommu_pass_through)
+		goto out;
+
 	printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
 			(1 << (amd_iommu_aperture_order-20)));
 
@@ -1174,6 +1185,9 @@ void __init amd_iommu_detect(void)
 	if (swiotlb || no_iommu || (iommu_detected && !iommu_aperture))
 		return;
 
+	if (iommu_pass_through)
+		swiotlb = 1;
+
 	if (acpi_table_parse(ACPI_IVRS, early_amd_iommu_detect) != 0) {
 		iommu_detected = 1;
 		amd_iommu_detected = 1;
diff --git a/include/asm-x86_64/amd_iommu.h b/include/asm-x86_64/amd_iommu.h
index cbf6829..305ec35 100644
--- a/include/asm-x86_64/amd_iommu.h
+++ b/include/asm-x86_64/amd_iommu.h
@@ -25,10 +25,13 @@
 #ifdef CONFIG_AMD_IOMMU
 extern int amd_iommu_init(void);
 extern int amd_iommu_init_dma_ops(void);
+extern int amd_iommu_init_passthrough(void);
 extern void amd_iommu_detect(void);
 extern irqreturn_t amd_iommu_int_handler(int irq, void *data,
 					 struct pt_regs *regs);
 extern void amd_iommu_apply_erratum_63(u16 devid);
+extern void amd_iommu_init_api(void);
+extern void amd_iommu_init_notifier(void);
 #else
 static inline int amd_iommu_init(void) { return -ENODEV; }
 static inline void amd_iommu_detect(void) { }
diff --git a/include/asm-x86_64/amd_iommu_types.h b/include/asm-x86_64/amd_iommu_types.h
index e6ca58b..7317402 100644
--- a/include/asm-x86_64/amd_iommu_types.h
+++ b/include/asm-x86_64/amd_iommu_types.h
@@ -143,6 +143,7 @@
 #define EVT_BUFFER_SIZE		8192 /* 512 entries */
 #define EVT_LEN_MASK		(0x9ULL << 56)
 
+#define PAGE_MODE_NONE    0x00
 #define PAGE_MODE_1_LEVEL 0x01
 #define PAGE_MODE_2_LEVEL 0x02
 #define PAGE_MODE_3_LEVEL 0x03
@@ -194,6 +195,8 @@
 #define PD_DMA_OPS_MASK		(1UL << 0) /* domain used for dma_ops */
 #define PD_DEFAULT_MASK		(1UL << 1) /* domain is a default dma_ops
 					      domain for an IOMMU */
+#define PD_PASSTHROUGH_MASK	(1UL << 2) /* domain has no page
+					      translation */
 
 /*
  * This structure contains generic data for  IOMMU protection domains