Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > fc11cd6e1c513a17304da94a5390f3cd > files > 2799

kernel-2.6.18-194.11.1.el5.src.rpm

From: Don Dutile <ddutile@redhat.com>
Date: Tue, 1 Dec 2009 23:42:36 -0500
Subject: [pci] intel-iommu: IOTLB flushing mods & ATSR support
Message-id: <4B15A9EC.2030802@redhat.com>
Patchwork-id: 21629
O-Subject: [RHEL5.5 PATCH V2] 1/9: INTEL-IOMMU: IOTLB flushing mods & ATSR
	(device IOTLB) support]
Bugzilla: 516811 518103
RH-Acked-by: Chris Wright <chrisw@redhat.com>
RH-Acked-by: Prarit Bhargava <prarit@redhat.com>

BZ 516727

0/9 states build & tests.

Attachment summarizes commit-id's backported.

For V2, pulled in patches from latter patches that made
sources as similar as possible to 2.6.32.
Dropped patch hunk in drivers/pci/iov.c that was already
backpored in -173/174.
Dropped include/acpi/actbl1.h and moved it's patch to 7/9 where it belonged.

Note: although ATSR isn't supported anywhere,
       the function calls are heavily intertwined in the
       next patch, such that extracting it would be a
       nightmare, asking for a serious branch from upstream
       and difficulty backporting future bug fixes.

Please review and ack.
- Don
>From 98f0a3fad08bb61c4225628c920cca1c10af9bf5 Mon Sep 17 00:00:00 2001
From: Donald Dutile <ddutile@redhat.com>
Date: Fri, 13 Nov 2009 15:28:32 -0500
Subject: [PATCH 1/9]   INTEL-IOMMU: IOTLB flushing mods & ATSR (device IOTLB) support

  Backport of following 2.6.31 commits:
  93a23a7271dfb811b3adb72779054c3a24433112 VT-d: support the device IOTLB
  9dd2fe89062c90a964d122b8be5615d6f2203bbe VT-d: cleanup iommu_flush_iotlb_psi and flush_unmaps
  6ba6c3a4cacfd68bf970e3e04e2ff0d66fa0f695 VT-d: add device IOTLB invalidation support
  aa5d2b515b6fca5f8a56eac84f7fa0a68c1ce9b7 VT-d: parse ATSR in DMA Remapping Reporting Structure
  e277d2fc79d6abb86fafadb58dca0b9c498a9aa7 PCI: handle Virtual Function ATS enabling
  302b4215daa0a704c843da40fd2529e5757a72da PCI: support the ATS capability
  dd7264355a203c3456dbba04db471947d3b55e7e intel-iommu: dmar_set_interrupt return error value
  c416daa98a584596df21ee2c26fac6579ee58f57 intel-iommu: Tidy up iommu->gcmd handling
  462b60f6ccc685f7e8aa04ff430e6b4ffedf629f intel-iommu: Fix tiny theoretical race in write-buffer flush.
  1f0ef2aa18802a8ce7eb5a5164aaaf4d59073801 intel-iommu: Clean up handling of "caching mode" vs. IOTLB flushing.

  First set of intel-iommu update from 2.6.32-rc4 (+ select cherry picks)

diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index 0351fe7..0b2ce5b 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -268,6 +268,84 @@ rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
 	}
 	return ret;
 }
+
+static LIST_HEAD(dmar_atsr_units);
+
+static int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
+{
+	struct acpi_dmar_atsr *atsr;
+	struct dmar_atsr_unit *atsru;
+
+	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
+	atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
+	if (!atsru)
+		return -ENOMEM;
+
+	atsru->hdr = hdr;
+	atsru->include_all = atsr->flags & 0x1;
+
+	list_add(&atsru->list, &dmar_atsr_units);
+
+	return 0;
+}
+
+static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
+{
+	int rc;
+	struct acpi_dmar_atsr *atsr;
+
+	if (atsru->include_all)
+		return 0;
+
+	atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
+	rc = dmar_parse_dev_scope((void *)(atsr + 1),
+				(void *)atsr + atsr->header.length,
+				&atsru->devices_cnt, &atsru->devices,
+				atsr->segment);
+	if (rc || !atsru->devices_cnt) {
+		list_del(&atsru->list);
+		kfree(atsru);
+	}
+
+	return rc;
+}
+
+int dmar_find_matched_atsr_unit(struct pci_dev *dev)
+{
+	int i;
+	struct pci_bus *bus;
+	struct acpi_dmar_atsr *atsr;
+	struct dmar_atsr_unit *atsru;
+
+	list_for_each_entry(atsru, &dmar_atsr_units, list) {
+		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
+		if (atsr->segment == pci_domain_nr(dev->bus))
+			goto found;
+	}
+
+	return 0;
+
+found:
+	for (bus = dev->bus; bus; bus = bus->parent) {
+		struct pci_dev *bridge = bus->self;
+
+		if (!bridge || !bridge->is_pcie ||
+		    bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
+			return 0;
+
+		if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
+			for (i = 0; i < atsru->devices_cnt; i++)
+				if (atsru->devices[i] == bridge)
+					return 1;
+			break;
+		}
+	}
+
+	if (atsru->include_all)
+		return 1;
+
+	return 0;
+}
 #endif
 
 static void __init
@@ -275,22 +353,28 @@ dmar_table_print_dmar_entry(struct acpi_dmar_header *header)
 {
 	struct acpi_dmar_hardware_unit *drhd;
 	struct acpi_dmar_reserved_memory *rmrr;
+	struct acpi_dmar_atsr *atsr;
 
 	switch (header->type) {
 	case ACPI_DMAR_TYPE_HARDWARE_UNIT:
-		drhd = (struct acpi_dmar_hardware_unit *)header;
+		drhd = container_of(header, struct acpi_dmar_hardware_unit,
+				    header);
 		printk (KERN_INFO PREFIX
-			"DRHD (flags: 0x%08x)base: 0x%016Lx\n",
-			drhd->flags, (unsigned long long)drhd->address);
+			"DRHD base: %#016Lx flags: %#x\n",
+			(unsigned long long)drhd->address, drhd->flags);
 		break;
 	case ACPI_DMAR_TYPE_RESERVED_MEMORY:
-		rmrr = (struct acpi_dmar_reserved_memory *)header;
-
+		rmrr = container_of(header, struct acpi_dmar_reserved_memory,
+				    header);
 		printk (KERN_INFO PREFIX
-			"RMRR base: 0x%016Lx end: 0x%016Lx\n",
+			"RMRR base: %#016Lx end: %#016Lx\n",
 			(unsigned long long)rmrr->base_address,
 			(unsigned long long)rmrr->end_address);
 		break;
+	case ACPI_DMAR_TYPE_ATSR:
+		atsr = container_of(header, struct acpi_dmar_atsr, header);
+		printk(KERN_INFO PREFIX "ATSR flags: %#x\n", atsr->flags);
+		break;
 	}
 }
 
@@ -353,6 +437,11 @@ parse_dmar_table(void)
 			ret = dmar_parse_one_rmrr(entry_header);
 #endif
 			break;
+		case ACPI_DMAR_TYPE_ATSR:
+#ifdef CONFIG_DMAR
+			ret = dmar_parse_one_atsr(entry_header);
+#endif
+			break;
 		default:
 			printk(KERN_WARNING PREFIX
 				"Unknown DMAR structure type\n");
@@ -421,11 +510,19 @@ int __init dmar_dev_scope_init(void)
 #ifdef CONFIG_DMAR
 	{
 		struct dmar_rmrr_unit *rmrr, *rmrr_n;
+		struct dmar_atsr_unit *atsr, *atsr_n;
+
 		list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
 			ret = rmrr_parse_dev(rmrr);
 			if (ret)
 				return ret;
 		}
+
+		list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
+			ret = atsr_parse_dev(atsr);
+			if (ret)
+				return ret;
+		}
 	}
 #endif
 
@@ -458,6 +555,9 @@ int __init dmar_table_init(void)
 #ifdef CONFIG_DMAR
 	if (list_empty(&dmar_rmrr_units))
 		printk(KERN_INFO PREFIX "No RMRR found\n");
+
+	if (list_empty(&dmar_atsr_units))
+		printk(KERN_INFO PREFIX "No ATSR found\n");
 #endif
 
 #ifdef CONFIG_INTR_REMAP
@@ -588,7 +688,8 @@ void free_iommu(struct intel_iommu *iommu)
  */
 static inline void reclaim_free_desc(struct q_inval *qi)
 {
-	while (qi->desc_status[qi->free_tail] == QI_DONE) {
+	while (qi->desc_status[qi->free_tail] == QI_DONE ||
+	       qi->desc_status[qi->free_tail] == QI_ABORT) {
 		qi->desc_status[qi->free_tail] = QI_FREE;
 		qi->free_tail = (qi->free_tail + 1) % QI_LENGTH;
 		qi->free_cnt++;
@@ -598,10 +699,13 @@ static inline void reclaim_free_desc(struct q_inval *qi)
 static int qi_check_fault(struct intel_iommu *iommu, int index)
 {
 	u32 fault;
-	int head;
+	int head, tail;
 	struct q_inval *qi = iommu->qi;
 	int wait_index = (index + 1) % QI_LENGTH;
 
+	if (qi->desc_status[wait_index] == QI_ABORT)
+		return -EAGAIN;
+
 	fault = readl(iommu->reg + DMAR_FSTS_REG);
 
 	/*
@@ -611,7 +715,11 @@ static int qi_check_fault(struct intel_iommu *iommu, int index)
 	 */
 	if (fault & DMA_FSTS_IQE) {
 		head = readl(iommu->reg + DMAR_IQH_REG);
-		if ((head >> 4) == index) {
+		if ((head >> DMAR_IQ_SHIFT) == index) {
+			printk(KERN_ERR "VT-d detected invalid descriptor: "
+				"low=%llx, high=%llx\n",
+				(unsigned long long)qi->desc[index].low,
+				(unsigned long long)qi->desc[index].high);
 			memcpy(&qi->desc[index], &qi->desc[wait_index],
 					sizeof(struct qi_desc));
 			__iommu_flush_cache(iommu, &qi->desc[index],
@@ -621,6 +729,32 @@ static int qi_check_fault(struct intel_iommu *iommu, int index)
 		}
 	}
 
+	/*
+	 * If ITE happens, all pending wait_desc commands are aborted.
+	 * No new descriptors are fetched until the ITE is cleared.
+	 */
+	if (fault & DMA_FSTS_ITE) {
+		head = readl(iommu->reg + DMAR_IQH_REG);
+		head = ((head >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH;
+		head |= 1;
+		tail = readl(iommu->reg + DMAR_IQT_REG);
+		tail = ((tail >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH;
+
+		writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG);
+
+		do {
+			if (qi->desc_status[head] == QI_IN_USE)
+				qi->desc_status[head] = QI_ABORT;
+			head = (head - 2 + QI_LENGTH) % QI_LENGTH;
+		} while (head != tail);
+
+		if (qi->desc_status[wait_index] == QI_ABORT)
+			return -EAGAIN;
+	}
+
+	if (fault & DMA_FSTS_ICE)
+		writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG);
+
 	return 0;
 }
 
@@ -630,7 +764,7 @@ static int qi_check_fault(struct intel_iommu *iommu, int index)
  */
 int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
 {
-	int rc = 0;
+	int rc;
 	struct q_inval *qi = iommu->qi;
 	struct qi_desc *hw, wait_desc;
 	int wait_index, index;
@@ -641,6 +775,9 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
 
 	hw = qi->desc;
 
+restart:
+	rc = 0;
+
 	spin_lock_irqsave(&qi->q_lock, flags);
 	while (qi->free_cnt < 3) {
 		spin_unlock_irqrestore(&qi->q_lock, flags);
@@ -671,7 +808,7 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
 	 * update the HW tail register indicating the presence of
 	 * new descriptors.
 	 */
-	writel(qi->free_head << 4, iommu->reg + DMAR_IQT_REG);
+	writel(qi->free_head << DMAR_IQ_SHIFT, iommu->reg + DMAR_IQT_REG);
 
 	while (qi->desc_status[wait_index] != QI_DONE) {
 		/*
@@ -683,18 +820,20 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
 		 */
 		rc = qi_check_fault(iommu, index);
 		if (rc)
-			goto out;
+			break;
 
 		spin_unlock(&qi->q_lock);
 		cpu_relax();
 		spin_lock(&qi->q_lock);
 	}
-out:
-	qi->desc_status[index] = qi->desc_status[wait_index] = QI_DONE;
+	qi->desc_status[index] = QI_DONE;
 
 	reclaim_free_desc(qi);
 	spin_unlock_irqrestore(&qi->q_lock, flags);
 
+	if (rc == -EAGAIN)
+		goto restart;
+
 	return rc;
 }
 
@@ -712,41 +851,26 @@ void qi_global_iec(struct intel_iommu *iommu)
 	qi_submit_sync(&desc, iommu);
 }
 
-int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
-		     u64 type, int non_present_entry_flush)
+void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
+		      u64 type)
 {
 	struct qi_desc desc;
 
-	if (non_present_entry_flush) {
-		if (!cap_caching_mode(iommu->cap))
-			return 1;
-		else
-			did = 0;
-	}
-
 	desc.low = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did)
 			| QI_CC_GRAN(type) | QI_CC_TYPE;
 	desc.high = 0;
 
-	return qi_submit_sync(&desc, iommu);
+	qi_submit_sync(&desc, iommu);
 }
 
-int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
-		   unsigned int size_order, u64 type,
-		   int non_present_entry_flush)
+void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+		    unsigned int size_order, u64 type)
 {
 	u8 dw = 0, dr = 0;
 
 	struct qi_desc desc;
 	int ih = 0;
 
-	if (non_present_entry_flush) {
-		if (!cap_caching_mode(iommu->cap))
-			return 1;
-		else
-			did = 0;
-	}
-
 	if (cap_write_drain(iommu->cap))
 		dw = 1;
 
@@ -758,7 +882,28 @@ int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
 	desc.high = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih)
 		| QI_IOTLB_AM(size_order);
 
-	return qi_submit_sync(&desc, iommu);
+	qi_submit_sync(&desc, iommu);
+}
+
+void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
+			u64 addr, unsigned mask)
+{
+	struct qi_desc desc;
+
+	if (mask) {
+		BUG_ON(addr & ((1 << (VTD_PAGE_SHIFT + mask)) - 1));
+		addr |= (1 << (VTD_PAGE_SHIFT + mask - 1)) - 1;
+		desc.high = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE;
+	} else
+		desc.high = QI_DEV_IOTLB_ADDR(addr);
+
+	if (qdep >= QI_DEV_IOTLB_MAX_INVS)
+		qdep = 0;
+
+	desc.low = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
+		   QI_DIOTLB_TYPE;
+
+	qi_submit_sync(&desc, iommu);
 }
 
 /*
@@ -788,7 +933,6 @@ void dmar_disable_qi(struct intel_iommu *iommu)
 		cpu_relax();
 
 	iommu->gcmd &= ~DMA_GCMD_QIE;
-
 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
 
 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl,
@@ -802,7 +946,7 @@ end:
  */
 static void __dmar_enable_qi(struct intel_iommu *iommu)
 {
-	u32 cmd, sts;
+	u32 sts;
 	unsigned long flags;
 	struct q_inval *qi = iommu->qi;
 
@@ -816,9 +960,8 @@ static void __dmar_enable_qi(struct intel_iommu *iommu)
 
 	dmar_writeq(iommu->reg + DMAR_IQA_REG, virt_to_phys(qi->desc));
 
-	cmd = iommu->gcmd | DMA_GCMD_QIE;
 	iommu->gcmd |= DMA_GCMD_QIE;
-	writel(cmd, iommu->reg + DMAR_GCMD_REG);
+	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
 
 	/* Make sure hardware complete it */
 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_QIES), sts);
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index b9894f3..88eb575 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -252,6 +252,7 @@ struct device_domain_info {
 	u8 bus;			/* PCI bus number */
 	u8 devfn;		/* PCI devfn number */
 	struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
+	struct intel_iommu *iommu; /* IOMMU used by this device */
 	struct dmar_domain *domain; /* pointer to domain */
 };
 
@@ -821,7 +822,7 @@ static int iommu_alloc_root_entry(struct intel_iommu *iommu)
 static void iommu_set_root_entry(struct intel_iommu *iommu)
 {
 	void *addr;
-	u32 cmd, sts;
+	u32 sts;
 	unsigned long flag;
 
 	addr = iommu->root_entry;
@@ -829,8 +830,7 @@ static void iommu_set_root_entry(struct intel_iommu *iommu)
 	spin_lock_irqsave(&iommu->register_lock, flag);
 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
 
-	cmd = iommu->gcmd | DMA_GCMD_SRTP;
-	writel(cmd, iommu->reg + DMAR_GCMD_REG);
+	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
 
 	/* Make sure hardware complete it */
 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
@@ -846,39 +846,25 @@ static void iommu_flush_write_buffer(struct intel_iommu *iommu)
 
 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
 		return;
-	val = iommu->gcmd | DMA_GCMD_WBF;
 
 	spin_lock_irqsave(&iommu->register_lock, flag);
-	writel(val, iommu->reg + DMAR_GCMD_REG);
+	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
 
 	/* Make sure hardware complete it */
 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
-			readl, (!(val & DMA_GSTS_WBFS)), val);
+		      readl, (!(val & DMA_GSTS_WBFS)), val);
 
 	spin_unlock_irqrestore(&iommu->register_lock, flag);
 }
 
 /* return value determine if we need a write buffer flush */
-static int __iommu_flush_context(struct intel_iommu *iommu,
-	u16 did, u16 source_id, u8 function_mask, u64 type,
-	int non_present_entry_flush)
+static void __iommu_flush_context(struct intel_iommu *iommu,
+				  u16 did, u16 source_id, u8 function_mask,
+				  u64 type)
 {
 	u64 val = 0;
 	unsigned long flag;
 
-	/*
-	 * In the non-present entry flush case, if hardware doesn't cache
-	 * non-present entry we do nothing and if hardware cache non-present
-	 * entry, we flush entries of domain 0 (the domain id is used to cache
-	 * any non-present entries)
-	 */
-	if (non_present_entry_flush) {
-		if (!cap_caching_mode(iommu->cap))
-			return 1;
-		else
-			did = 0;
-	}
-
 	switch (type) {
 	case DMA_CCMD_GLOBAL_INVL:
 		val = DMA_CCMD_GLOBAL_INVL;
@@ -903,33 +889,16 @@ static int __iommu_flush_context(struct intel_iommu *iommu,
 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
 
 	spin_unlock_irqrestore(&iommu->register_lock, flag);
-
-	/* flush context entry will implicitly flush write buffer */
-	return 0;
 }
 
 /* return value determine if we need a write buffer flush */
-static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
-	u64 addr, unsigned int size_order, u64 type,
-	int non_present_entry_flush)
+static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
+			     u64 addr, unsigned int size_order, u64 type)
 {
 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
 	u64 val = 0, val_iva = 0;
 	unsigned long flag;
 
-	/*
-	 * In the non-present entry flush case, if hardware doesn't cache
-	 * non-present entry we do nothing and if hardware cache non-present
-	 * entry, we flush entries of domain 0 (the domain id is used to cache
-	 * any non-present entries)
-	 */
-	if (non_present_entry_flush) {
-		if (!cap_caching_mode(iommu->cap))
-			return 1;
-		else
-			did = 0;
-	}
-
 	switch (type) {
 	case DMA_TLB_GLOBAL_FLUSH:
 		/* global flush doesn't need set IVA_REG */
@@ -977,37 +946,102 @@ static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
 		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
 			(unsigned long long)DMA_TLB_IIRG(type),
 			(unsigned long long)DMA_TLB_IAIG(val));
-	/* flush iotlb entry will implicitly flush write buffer */
-	return 0;
 }
 
-static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
-	u64 addr, unsigned int pages, int non_present_entry_flush)
+static struct device_domain_info *iommu_support_dev_iotlb(
+	struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
+{
+	int found = 0;
+	unsigned long flags;
+	struct device_domain_info *info;
+	struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
+
+	if (!ecap_dev_iotlb_support(iommu->ecap))
+		return NULL;
+
+	if (!iommu->qi)
+		return NULL;
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	list_for_each_entry(info, &domain->devices, link)
+		if (info->bus == bus && info->devfn == devfn) {
+			found = 1;
+			break;
+		}
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+
+	if (!found || !info->dev)
+		return NULL;
+
+	if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
+		return NULL;
+
+	if (!dmar_find_matched_atsr_unit(info->dev))
+		return NULL;
+
+	info->iommu = iommu;
+
+	return info;
+}
+
+static void iommu_enable_dev_iotlb(struct device_domain_info *info)
+{
+	if (!info)
+		return;
+
+	pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
+}
+
+static void iommu_disable_dev_iotlb(struct device_domain_info *info)
+{
+	if (!info->dev || !pci_ats_enabled(info->dev))
+		return;
+
+	pci_disable_ats(info->dev);
+}
+
+static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
+				  u64 addr, unsigned mask)
+{
+	u16 sid, qdep;
+	unsigned long flags;
+	struct device_domain_info *info;
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	list_for_each_entry(info, &domain->devices, link) {
+		if (!info->dev || !pci_ats_enabled(info->dev))
+			continue;
+
+		sid = info->bus << 8 | info->devfn;
+		qdep = pci_ats_queue_depth(info->dev);
+		qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
+	}
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
+static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
+				u64 addr, unsigned int pages)
 {
-	unsigned int mask;
+	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
 
 	BUG_ON(addr & (~VTD_PAGE_MASK));
 	BUG_ON(pages == 0);
 
-	/* Fallback to domain selective flush if no PSI support */
-	if (!cap_pgsel_inv(iommu->cap))
-		return iommu->flush.flush_iotlb(iommu, did, 0, 0,
-						DMA_TLB_DSI_FLUSH,
-						non_present_entry_flush);
-
-	/*
+	/* 
+	 * Fallback to domain selective flush if no PSI support or the size is
+	 * too big.
 	 * PSI requires page size to be 2 ^ x, and the base address is naturally
 	 * aligned to the size
 	 */
-	mask = ilog2(__roundup_pow_of_two(pages));
-	/* Fallback to domain selective flush if size is too big */
-	if (mask > cap_max_amask_val(iommu->cap))
-		return iommu->flush.flush_iotlb(iommu, did, 0, 0,
-			DMA_TLB_DSI_FLUSH, non_present_entry_flush);
+	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
+		iommu->flush.flush_iotlb(iommu, did, 0, 0,
+					 DMA_TLB_DSI_FLUSH);
+	else
+		iommu->flush.flush_iotlb(iommu, did, addr, mask,
+		                         DMA_TLB_DSI_FLUSH);
 
-	return iommu->flush.flush_iotlb(iommu, did, addr, mask,
-					DMA_TLB_PSI_FLUSH,
-					non_present_entry_flush);
+	if (did)
+		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
 }
 
 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
@@ -1033,13 +1067,13 @@ static int iommu_enable_translation(struct intel_iommu *iommu)
 	unsigned long flags;
 
 	spin_lock_irqsave(&iommu->register_lock, flags);
+	iommu->gcmd |= DMA_GCMD_TE;
 	writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
 
 	/* Make sure hardware complete it */
 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
-		readl, (sts & DMA_GSTS_TES), sts);
+		      readl, (sts & DMA_GSTS_TES), sts);
 
-	iommu->gcmd |= DMA_GCMD_TE;
 	spin_unlock_irqrestore(&iommu->register_lock, flags);
 	return 0;
 }
@@ -1055,7 +1089,7 @@ static int iommu_disable_translation(struct intel_iommu *iommu)
 
 	/* Make sure hardware complete it */
 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
-		readl, (!(sts & DMA_GSTS_TES)), sts);
+		      readl, (!(sts & DMA_GSTS_TES)), sts);
 
 	spin_unlock_irqrestore(&iommu->register_lock, flag);
 	return 0;
@@ -1348,6 +1382,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
 	unsigned long ndomains;
 	int id;
 	int agaw;
+	struct device_domain_info *info = NULL;
 
 	pr_debug("Set context mapping for %02x:%02x.%d\n",
 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
@@ -1415,29 +1450,45 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
 
 	context_set_domain_id(context, id);
 
+	if (translation != CONTEXT_TT_PASS_THROUGH) {
+		info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
+		translation = info ? CONTEXT_TT_DEV_IOTLB :
+				     CONTEXT_TT_MULTI_LEVEL;
+	}
+
 	/*
 	 * In pass through mode, AW must be programmed to indicate the largest
 	 * AGAW value supported by hardware. And ASR is ignored by hardware.
 	 */
-	if (likely(translation == CONTEXT_TT_MULTI_LEVEL)) {
-		context_set_address_width(context, iommu->agaw);
-		context_set_address_root(context, virt_to_phys(pgd));
-	} else
+	if (unlikely(translation == CONTEXT_TT_PASS_THROUGH)) 
 		context_set_address_width(context, iommu->msagaw);
+	else {
+		context_set_address_root(context, virt_to_phys(pgd));
+		context_set_address_width(context, iommu->agaw);
+	}
 
 	context_set_translation_type(context, translation);
 	context_set_fault_enable(context);
 	context_set_present(context);
 	domain_flush_cache(domain, context, sizeof(*context));
 
-	/* it's a non-present to present mapping */
-	if (iommu->flush.flush_context(iommu, domain->id,
-		(((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
-		DMA_CCMD_DEVICE_INVL, 1))
+	/*
+	 * It's a non-present to present mapping. If hardware doesn't cache
+	 * non-present entry we only need to flush the write-buffer. If the
+	 * _does_ cache non-present entries, then it does so in the special
+	 * domain #0, which we have to flush:
+	 */
+	if (cap_caching_mode(iommu->cap)) {
+		iommu->flush.flush_context(iommu, 0,
+					   (((u16)bus) << 8) | devfn,
+					   DMA_CCMD_MASK_NOBIT,
+					   DMA_CCMD_DEVICE_INVL);
+		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH);
+	} else {
 		iommu_flush_write_buffer(iommu);
-	else
-		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
+	}
 
+	iommu_enable_dev_iotlb(info);
 	spin_unlock_irqrestore(&iommu->lock, flags);
 
 	spin_lock_irqsave(&domain->iommu_lock, flags);
@@ -1568,9 +1619,8 @@ static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
 
 	clear_context_table(iommu, bus, devfn);
 	iommu->flush.flush_context(iommu, 0, 0, 0,
-					   DMA_CCMD_GLOBAL_INVL, 0);
-	iommu->flush.flush_iotlb(iommu, 0, 0, 0,
-					 DMA_TLB_GLOBAL_FLUSH, 0);
+					   DMA_CCMD_GLOBAL_INVL);
+	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
 }
 
 static void domain_remove_dev_info(struct dmar_domain *domain)
@@ -1589,6 +1639,7 @@ static void domain_remove_dev_info(struct dmar_domain *domain)
 			info->dev->iommu = NULL;
 		spin_unlock_irqrestore(&device_domain_lock, flags);
 
+		iommu_disable_dev_iotlb(info);
 		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
 		iommu_detach_dev(iommu, info->bus, info->devfn);
 		free_devinfo_mem(info);
@@ -2117,10 +2168,8 @@ static int __init init_dmars(void)
 
 		iommu_set_root_entry(iommu);
 
-		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
-					   0);
-		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
-					 0);
+		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
+		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
 		iommu_disable_protect_mem_regions(iommu);
 
 		ret = iommu_enable_translation(iommu);
@@ -2267,10 +2316,11 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
 	if (ret)
 		goto error;
 
-	/* it's a non-present to present mapping */
-	ret = iommu_flush_iotlb_psi(iommu, domain->id,
-			start_paddr, size >> VTD_PAGE_SHIFT, 1);
-	if (ret)
+	/* it's a non-present to present mapping nly flush if caching mode */
+	if (cap_caching_mode(iommu->cap))
+		iommu_flush_iotlb_psi(iommu, 0, start_paddr, 
+				      size >> VTD_PAGE_SHIFT);
+	else
 		iommu_flush_write_buffer(iommu);
 
 	return start_paddr + ((u64)paddr & (~PAGE_MASK));
@@ -2302,15 +2352,16 @@ static void flush_unmaps(void)
 		if (!iommu)
 			continue;
 
-		if (deferred_flush[i].next) {
-			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
-						 DMA_TLB_GLOBAL_FLUSH, 0);
-			for (j = 0; j < deferred_flush[i].next; j++) {
-				__free_iova(&deferred_flush[i].domain[j]->iovad,
-						deferred_flush[i].iova[j]);
-			}
-			deferred_flush[i].next = 0;
+		if (!deferred_flush[i].next)
+			continue;
+
+		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
+					 DMA_TLB_GLOBAL_FLUSH);
+		for (j = 0; j < deferred_flush[i].next; j++) {
+			__free_iova(&deferred_flush[i].domain[j]->iovad,
+				    deferred_flush[i].iova[j]);
 		}
+		deferred_flush[i].next = 0;
 	}
 
 	list_size = 0;
@@ -2382,9 +2433,8 @@ static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t s
 	/* free page tables */
 	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
 	if (intel_iommu_strict) {
-		if (iommu_flush_iotlb_psi(iommu,
-			domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
-			iommu_flush_write_buffer(iommu);
+		iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
+				      size >> VTD_PAGE_SHIFT);
 		/* free iova */
 		__free_iova(&domain->iovad, iova);
 	} else {
@@ -2473,9 +2523,8 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
 	/* free page tables */
 	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
 
-	if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
-			size >> VTD_PAGE_SHIFT, 0))
-		iommu_flush_write_buffer(iommu);
+	iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
+			      size >> VTD_PAGE_SHIFT);
 
 	/* free iova */
 	__free_iova(&domain->iovad, iova);
@@ -2567,10 +2616,13 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne
 		offset += size;
 	}
 
-	/* it's a non-present to present mapping */
-	if (iommu_flush_iotlb_psi(iommu, domain->id,
-			start_addr, offset >> VTD_PAGE_SHIFT, 1))
+	/* it's a non-present to present mapping. Only flush if caching mode */
+	if (cap_caching_mode(iommu->cap))
+		iommu_flush_iotlb_psi(iommu, 0, start_addr,
+				      offset >> VTD_PAGE_SHIFT);
+	else
 		iommu_flush_write_buffer(iommu);
+
 	return nelems;
 }
 
@@ -2722,9 +2774,9 @@ static int init_iommu_hw(void)
 		iommu_set_root_entry(iommu);
 
 		iommu->flush.flush_context(iommu, 0, 0, 0,
-						DMA_CCMD_GLOBAL_INVL, 0);
+					   DMA_CCMD_GLOBAL_INVL);
 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
-						DMA_TLB_GLOBAL_FLUSH, 0);
+					 DMA_TLB_GLOBAL_FLUSH);
 		iommu_disable_protect_mem_regions(iommu);
 		iommu_enable_translation(iommu);
 	}
@@ -2739,9 +2791,9 @@ static void iommu_flush_all(void)
 
 	for_each_active_iommu(iommu, drhd) {
 		iommu->flush.flush_context(iommu, 0, 0, 0,
-						DMA_CCMD_GLOBAL_INVL, 0);
+					   DMA_CCMD_GLOBAL_INVL);
 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
-						DMA_TLB_GLOBAL_FLUSH, 0);
+					 DMA_TLB_GLOBAL_FLUSH);
 	}
 }
 
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 2c9111b..c3b92af 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -5,6 +5,7 @@
  *
  * PCI Express I/O Virtualization (IOV) support.
  *   Single Root IOV 1.0
+ *   Address Translation Service 1.0
  */
 
 #include <linux/pci.h>
@@ -304,12 +305,17 @@ static void sriov_disable(struct pci_dev *dev)
 static int sriov_init(struct pci_dev *dev, int pos)
 {
 	int i;
+	int rc;
 	u32 pgsz;
 	u16 ctrl, total, offset, stride;
 	struct pci_sriov *iov;
 	struct resource *res;
 	struct pci_dev *pdev;
 
+	if (dev->pcie_type != PCI_EXP_TYPE_RC_END &&
+	    dev->pcie_type != PCI_EXP_TYPE_ENDPOINT)
+		return -ENODEV;
+
 	pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
 	if (ctrl & PCI_SRIOV_CTRL_VFE) {
 		pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0);
@@ -357,8 +363,8 @@ found:
 		if (!res->flags)
 			continue;
 		if ((res->end - res->start + 1) & (PAGE_SIZE - 1)) {
-			kfree(iov);
-			return -EIO;
+			rc = -EIO;
+			goto failed;
 		}
 		res->end = res->start + (res->end - res->start + 1) * total - 1;
 	}
@@ -377,26 +383,35 @@ found:
 
 	if (pdev)
 		iov->dev = pci_dev_get(pdev);
-	else {
+	else
 		iov->dev = dev;
-		mutex_init(&iov->lock);
-	}
+
+	mutex_init(&iov->lock);
 
 	dev->sriov = iov;
 	dev->is_physfn = 1;
 
 	return 0;
+
+failed:
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = iov->res + i;
+		res->flags = 0;
+	}
+	kfree(iov);
+
+	return rc;
 }
 
 static void sriov_release(struct pci_dev *dev)
 {
 	BUG_ON(dev->sriov->nr_virtfn);
 
-	if (dev == dev->sriov->dev)
-		mutex_destroy(&dev->sriov->lock);
-	else
+	if (dev != dev->sriov->dev)
 		pci_dev_put(dev->sriov->dev);
 
+	mutex_destroy(&dev->sriov->lock);
+
 	kfree(dev->sriov);
 	dev->sriov = NULL;
 }
@@ -431,6 +446,9 @@ int pci_iov_init(struct pci_dev *dev)
 {
 	int pos;
 
+	if (!dev->is_pcie)
+		return -ENODEV;
+
 	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV);
 	if (pos)
 		return sriov_init(dev, pos);
@@ -507,6 +525,7 @@ int pci_iov_bus_range(struct pci_bus *bus)
 /**
  * pci_enable_sriov - enable the SR-IOV capability
  * @dev: the PCI device
+ * @nr_virtfn: number of virtual functions to enable
  *
  * Returns 0 on success, or negative on failure.
  */
@@ -535,3 +554,145 @@ void pci_disable_sriov(struct pci_dev *dev)
 	sriov_disable(dev);
 }
 EXPORT_SYMBOL_GPL(pci_disable_sriov);
+
+static int ats_alloc_one(struct pci_dev *dev, int ps)
+{
+	int pos;
+	u16 cap;
+	struct pci_ats *ats;
+
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS);
+	if (!pos)
+		return -ENODEV;
+
+	ats = kzalloc(sizeof(*ats), GFP_KERNEL);
+	if (!ats)
+		return -ENOMEM;
+
+	ats->pos = pos;
+	ats->stu = ps;
+	pci_read_config_word(dev, pos + PCI_ATS_CAP, &cap);
+	ats->qdep = PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) :
+					    PCI_ATS_MAX_QDEP;
+	dev->ats = ats;
+
+	return 0;
+}
+
+static void ats_free_one(struct pci_dev *dev)
+{
+	kfree(dev->ats);
+	dev->ats = NULL;
+}
+
+/**
+ * pci_enable_ats - enable the ATS capability
+ * @dev: the PCI device
+ * @ps: the IOMMU page shift
+ *
+ * Returns 0 on success, or negative on failure.
+ */
+int pci_enable_ats(struct pci_dev *dev, int ps)
+{
+	int rc;
+	u16 ctrl;
+
+	BUG_ON(dev->ats && dev->ats->is_enabled);
+
+	if (ps < PCI_ATS_MIN_STU)
+		return -EINVAL;
+
+	if (dev->is_physfn || dev->is_virtfn) {
+		struct pci_dev *pdev = dev->is_physfn ? dev : dev->physfn;
+
+		mutex_lock(&pdev->sriov->lock);
+		if (pdev->ats)
+			rc = pdev->ats->stu == ps ? 0 : -EINVAL;
+		else
+			rc = ats_alloc_one(pdev, ps);
+
+		if (!rc)
+			pdev->ats->ref_cnt++;
+		mutex_unlock(&pdev->sriov->lock);
+		if (rc)
+			return rc;
+	}
+
+	if (!dev->is_physfn) {
+		rc = ats_alloc_one(dev, ps);
+		if (rc)
+			return rc;
+	}
+
+	ctrl = PCI_ATS_CTRL_ENABLE;
+	if (!dev->is_virtfn)
+		ctrl |= PCI_ATS_CTRL_STU(ps - PCI_ATS_MIN_STU);
+	pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl);
+
+	dev->ats->is_enabled = 1;
+
+	return 0;
+}
+
+/**
+ * pci_disable_ats - disable the ATS capability
+ * @dev: the PCI device
+ */
+void pci_disable_ats(struct pci_dev *dev)
+{
+	u16 ctrl;
+
+	BUG_ON(!dev->ats || !dev->ats->is_enabled);
+
+	pci_read_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, &ctrl);
+	ctrl &= ~PCI_ATS_CTRL_ENABLE;
+	pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl);
+
+	dev->ats->is_enabled = 0;
+
+	if (dev->is_physfn || dev->is_virtfn) {
+		struct pci_dev *pdev = dev->is_physfn ? dev : dev->physfn;
+
+		mutex_lock(&pdev->sriov->lock);
+		pdev->ats->ref_cnt--;
+		if (!pdev->ats->ref_cnt)
+			ats_free_one(pdev);
+		mutex_unlock(&pdev->sriov->lock);
+	}
+
+	if (!dev->is_physfn)
+		ats_free_one(dev);
+}
+
+/**
+ * pci_ats_queue_depth - query the ATS Invalidate Queue Depth
+ * @dev: the PCI device
+ *
+ * Returns the queue depth on success, or negative on failure.
+ *
+ * The ATS spec uses 0 in the Invalidate Queue Depth field to
+ * indicate that the function can accept 32 Invalidate Request.
+ * But here we use the `real' values (i.e. 1~32) for the Queue
+ * Depth; and 0 indicates the function shares the Queue with
+ * other functions (doesn't exclusively own a Queue).
+ */
+int pci_ats_queue_depth(struct pci_dev *dev)
+{
+	int pos;
+	u16 cap;
+
+	if (dev->is_virtfn)
+		return 0;
+
+	if (dev->ats)
+		return dev->ats->qdep;
+
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS);
+	if (!pos)
+		return -ENODEV;
+
+	pci_read_config_word(dev, pos + PCI_ATS_CAP, &cap);
+
+	return PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) :
+				       PCI_ATS_MAX_QDEP;
+}
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 4c088b6..b594290 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -140,6 +140,15 @@ struct pci_sriov {
 	struct resource res[PCI_SRIOV_NUM_BARS]; /* VF BAR resource */
 };
 
+/* Address Translation Service */
+struct pci_ats {
+        int pos;        /* capability position */
+        int stu;        /* Smallest Translation Unit */
+        int qdep;       /* Invalidate Queue Depth */
+        int ref_cnt;    /* Physical Function reference count */
+        int is_enabled:1;       /* Enable bit is set */
+};
+
 #ifdef CONFIG_PCI_IOV
 extern int pci_iov_init(struct pci_dev *dev);
 extern void pci_iov_release(struct pci_dev *dev);
@@ -147,6 +156,20 @@ extern int pci_iov_resource_bar(struct pci_dev *dev, int resno,
 				enum pci_bar_type *type);
 extern void pci_restore_iov_state(struct pci_dev *dev);
 extern int pci_iov_bus_range(struct pci_bus *bus);
+
+extern int pci_enable_ats(struct pci_dev *dev, int ps);
+extern void pci_disable_ats(struct pci_dev *dev);
+extern int pci_ats_queue_depth(struct pci_dev *dev);
+/**
+ * pci_ats_enabled - query the ATS status
+ * @dev: the PCI device
+ *
+ * Returns 1 if ATS capability is enabled, or 0 if not.
+ */
+static inline int pci_ats_enabled(struct pci_dev *dev)
+{
+	return dev->ats && dev->ats->is_enabled;
+}
 #else
 static inline int pci_iov_init(struct pci_dev *dev)
 {
@@ -168,5 +191,21 @@ static inline int pci_iov_bus_range(struct pci_bus *bus)
 {
 	return 0;
 }
+
+static inline int pci_enable_ats(struct pci_dev *dev, int ps)
+{
+	return -ENODEV;
+}
+static inline void pci_disable_ats(struct pci_dev *dev)
+{
+}
+static inline int pci_ats_queue_depth(struct pci_dev *dev)
+{
+	return -ENODEV;
+}
+static inline int pci_ats_enabled(struct pci_dev *dev)
+{
+	return 0;
+}
 #endif /* CONFIG_PCI_IOV */
 struct pci_dev *pci_find_upstream_pcie_bridge(struct pci_dev *pdev);
diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h
index b0e9536..f1d473b 100644
--- a/include/acpi/actbl1.h
+++ b/include/acpi/actbl1.h
@@ -300,10 +300,32 @@ struct acpi_dmar_reserved_memory {
 	u64 end_address;	/* 4_k aligned limit address */
 };
 
-/* Flags */
+/* Masks for Flags field above */
 
 #define ACPI_DMAR_ALLOW_ALL         (1)
 
+/* 2: Root Port ATS Capability Reporting Structure */
+
+struct acpi_dmar_atsr {
+	struct acpi_dmar_header header;
+	u8 flags;
+	u8 reserved;
+	u16 segment;
+};
+
+/* Masks for Flags field above */
+
+#define ACPI_DMAR_ALL_PORTS         (1)
+
+/* 3: Remapping Hardware Static Affinity Structure */
+
+struct acpi_dmar_rhsa {
+	struct acpi_dmar_header header;
+	u32 reserved;
+	u64 base_address;
+	u32 proximity_domain;
+};
+
 /*******************************************************************************
  *
  * ECDT - Embedded Controller Boot Resources Table
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
index e0a03af..5619f85 100644
--- a/include/linux/dma_remapping.h
+++ b/include/linux/dma_remapping.h
@@ -14,6 +14,7 @@
 #define DMA_PTE_SNP (1 << 11)
 
 #define CONTEXT_TT_MULTI_LEVEL	0
+#define CONTEXT_TT_DEV_IOTLB	1
 #define CONTEXT_TT_PASS_THROUGH 2
 
 struct intel_iommu;
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 56d5435..d3cf68a 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -193,6 +193,15 @@ struct dmar_rmrr_unit {
 
 #define for_each_rmrr_units(rmrr) \
 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
+
+struct dmar_atsr_unit {
+	struct list_head list;		/* list of ATSR units */
+	struct acpi_dmar_header *hdr;	/* ACPI header */
+	struct pci_dev **devices;	/* target devices */
+	int devices_cnt;		/* target device count */
+	u8 include_all:1;		/* include all ports */
+};
+
 /* Intel DMAR  initialization functions */
 extern int intel_iommu_init(void);
 #else
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 90fa94b..950ad6a 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -62,6 +62,7 @@
 #define	DMAR_PHMLIMIT_REG 0x78	/* pmrr high limit */
 #define DMAR_IQH_REG	0x80	/* Invalidation queue head register */
 #define DMAR_IQT_REG	0x88	/* Invalidation queue tail register */
+#define DMAR_IQ_SHIFT	4	/* Invalidation queue head/tail shift */
 #define DMAR_IQA_REG	0x90	/* Invalidation queue addr register */
 #define DMAR_ICS_REG	0x98	/* Invalidation complete status register */
 #define DMAR_IRTA_REG	0xb8    /* Interrupt remapping table addr register */
@@ -132,6 +133,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
 #define ecap_pass_through(e)	((e >> 6) & 0x1)
 #define ecap_eim_support(e)	((e >> 4) & 0x1)
 #define ecap_ir_support(e)	((e >> 3) & 0x1)
+#define ecap_dev_iotlb_support(e)	(((e) >> 2) & 0x1)
 #define ecap_max_handle_mask(e) ((e >> 20) & 0xf)
 #define ecap_sc_support(e)	((e >> 7) & 0x1) /* Snooping Control */
 
@@ -207,6 +209,8 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
 #define DMA_FSTS_PPF ((u32)2)
 #define DMA_FSTS_PFO ((u32)1)
 #define DMA_FSTS_IQE (1 << 4)
+#define DMA_FSTS_ICE (1 << 5)
+#define DMA_FSTS_ITE (1 << 6)
 #define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)
 
 /* FRCD_REG, 32 bits access */
@@ -235,7 +239,8 @@ do {									\
 enum {
 	QI_FREE,
 	QI_IN_USE,
-	QI_DONE
+	QI_DONE,
+	QI_ABORT
 };
 
 #define QI_CC_TYPE		0x1
@@ -264,6 +269,12 @@ enum {
 #define QI_CC_DID(did)		(((u64)did) << 16)
 #define QI_CC_GRAN(gran)	(((u64)gran) >> (DMA_CCMD_INVL_GRANU_OFFSET-4))
 
+#define QI_DEV_IOTLB_SID(sid)	((u64)((sid) & 0xffff) << 32)
+#define QI_DEV_IOTLB_QDEP(qdep)	(((qdep) & 0x1f) << 16)
+#define QI_DEV_IOTLB_ADDR(addr)	((u64)(addr) & VTD_PAGE_MASK)
+#define QI_DEV_IOTLB_SIZE	1
+#define QI_DEV_IOTLB_MAX_INVS	32
+
 struct qi_desc {
 	u64 low, high;
 };
@@ -290,10 +301,10 @@ struct ir_table {
 #endif
 
 struct iommu_flush {
-	int (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
-		u64 type, int non_present_entry_flush);
-	int (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr,
-		unsigned int size_order, u64 type, int non_present_entry_flush);
+	void (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid,
+			      u8 fm, u64 type);
+	void (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr,
+			    unsigned int size_order, u64 type);
 };
 
 enum {
@@ -340,6 +351,7 @@ static inline void __iommu_flush_cache(
 }
 
 extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
+extern int dmar_find_matched_atsr_unit(struct pci_dev *dev);
 
 extern int alloc_iommu(struct dmar_drhd_unit *drhd);
 extern void free_iommu(struct intel_iommu *iommu);
@@ -348,12 +360,15 @@ extern void dmar_disable_qi(struct intel_iommu *iommu);
 extern int dmar_reenable_qi(struct intel_iommu *iommu);
 extern void qi_global_iec(struct intel_iommu *iommu);
 
-extern int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
-			        u8 fm, u64 type, int non_present_entry_flush);
-extern int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
-			  unsigned int size_order, u64 type,
-			  int non_present_entry_flush);
+extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
+			     u8 fm, u64 type);
+extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+			  unsigned int size_order, u64 type);
+extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
+			       u64 addr, unsigned mask);
 
 extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
 
+extern int dmar_ir_support(void);
+
 #endif
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4640084..0c1c803 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -119,6 +119,9 @@ struct pci_cap_saved_state {
 	u32 data[0];
 };
 
+struct pci_sriov;
+struct pci_ats;
+
 /*
  * The pci_dev structure is used to describe PCI devices.
  */
@@ -208,7 +211,10 @@ struct pci_dev {
 #ifdef CONFIG_DMAR
 	void *iommu; /* hook for IOMMU specific extension */
 #endif
+#ifdef CONFIG_PCI_IOV
+	struct pci_ats *ats; /* Address Translation Service */
 #endif
+#endif /* !__GENKSYMS__ */
 };
 
 #define pci_dev_g(n) list_entry(n, struct pci_dev, global_list)
diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h
index 2bbc5bb..3742fcb 100644
--- a/include/linux/pci_regs.h
+++ b/include/linux/pci_regs.h
@@ -431,6 +431,7 @@
 #define PCI_EXT_CAP_ID_DSN	3
 #define PCI_EXT_CAP_ID_PWR	4
 #define PCI_EXT_CAP_ID_ARI	14
+#define PCI_EXT_CAP_ID_ATS	15
 #define PCI_EXT_CAP_ID_SRIOV	16
 
 /* Advanced Error Reporting */
@@ -549,6 +550,15 @@
 #define  PCI_ARI_CTRL_ACS	0x0002	/* ACS Function Groups Enable */
 #define  PCI_ARI_CTRL_FG(x)	(((x) >> 4) & 7) /* Function Group */
 
+/* Address Translation Service */
+#define PCI_ATS_CAP		0x04	/* ATS Capability Register */
+#define  PCI_ATS_CAP_QDEP(x)	((x) & 0x1f)	/* Invalidate Queue Depth */
+#define  PCI_ATS_MAX_QDEP	32	/* Max Invalidate Queue Depth */
+#define PCI_ATS_CTRL		0x06	/* ATS Control Register */
+#define  PCI_ATS_CTRL_ENABLE	0x8000	/* ATS Enable */
+#define  PCI_ATS_CTRL_STU(x)	((x) & 0x1f)	/* Smallest Translation Unit */
+#define  PCI_ATS_MIN_STU	12	/* shift of minimum STU block */
+
 /* Single Root I/O Virtualization */
 #define PCI_SRIOV_CAP		0x04	/* SR-IOV Capabilities */
 #define  PCI_SRIOV_CAP_VFM	0x01	/* VF Migration Capable */