
kernel-2.6.18-194.11.1.el5.src.rpm

From: ddugger@redhat.com <ddugger@redhat.com>
Date: Tue, 21 Apr 2009 13:01:22 -0600
Subject: [xen] VT-d: enhance MTRR/PAT virtualization
Message-id: 200904211901.n3LJ1Mcx028378@sobek.n0ano.com
O-Subject: [RHEL5.4 PATCH 1/6 V2] BZ496873: VT-d: Enhance MTRR/PAT virtualization
Bugzilla: 496873
RH-Acked-by: Rik van Riel <riel@redhat.com>
RH-Acked-by: Gerd Hoffmann <kraxel@redhat.com>
RH-Acked-by: Chris Lalancette <clalance@redhat.com>
RH-Acked-by: Justin M. Forbes <jforbes@redhat.com>

VT-d: Enhance MTRR/PAT virtualization when EPT and VT-d are both enabled,
utilise the snoop control capability of the VT-d engine, and do some
cleanup in the VT-d and EPT code.

Differences from Xen-unstable:
1) epte_get_entry_emt() is not in the xen/arch/x86/hvm/mtrr.c file,
   since that file doesn't exist in xen-3.1, so I put it in the
   p2m-ept.c file instead.
2) Some of the checks in epte_get_entry_emt() are removed, since the
   corresponding conditions don't exist in xen-3.1.

Upstream Status: Accepted (CS 19079, 19154, 19165, 19198)

Signed-off-by: Xiaohui Xin <xiaohui.xin@intel.com>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Signed-off-by: Don Dugger <donald.d.dugger@intel.com>

Yet Another Resend - resolves the conflicting BZ numbers between the
Subject line (correct) and the message body (incorrect).
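
For anyone testing this: with the patch applied, snoop control defaults to
on whenever the IOMMU is enabled, and can be toggled from the hypervisor
command line. A hypothetical grub stanza (the xen.gz path and the other
options are illustrative only, not part of this patch):

    kernel /boot/xen.gz iommu=force,no-snoop

As the parse_iommu_param() hunk below shows, the iommu= value is a
comma-separated list; "snoop" (the default once the IOMMU is enabled) and
"no-snoop" flip iommu_snoop, and intel_vtd_setup() still clears it unless
every VT-d engine advertises snoop control in its extended capabilities.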

diff --git a/arch/x86/mm/p2m-ept.c b/arch/x86/mm/p2m-ept.c
index e5420f5..4fb80e3 100644
--- a/arch/x86/mm/p2m-ept.c
+++ b/arch/x86/mm/p2m-ept.c
@@ -24,6 +24,7 @@
 #include <asm/domain.h>
 #include <asm/hvm/vmx/vmx.h>
 #include <xen/iocap.h>
+#include <asm/mtrr.h>
 
 #if 1 /* XEN_VERSION == 3 && XEN_SUBVERSION < 2 */
 
@@ -72,6 +73,38 @@ static mfn_t compat_ept_get_entry_fast(unsigned long gfn)
 
 #endif
 
+uint8_t epte_get_entry_emt(
+    struct domain *d, unsigned long gfn,
+    unsigned long mfn, uint8_t *igmt, int direct_mmio)
+{
+    struct vcpu *v = current;
+
+    *igmt = 0;
+
+    if ( (current->domain != d) && ((v = d->vcpu[0]) == NULL) )
+        return MTRR_TYPE_WRBACK;
+
+    if ( !mfn_valid(mfn) )
+        return MTRR_TYPE_UNCACHABLE;
+
+    if ( !iommu_enabled )
+    {
+        *igmt = 1;
+        return MTRR_TYPE_WRBACK;
+    }
+
+    if ( direct_mmio )
+        return MTRR_TYPE_UNCACHABLE;
+
+    if ( iommu_snoop )
+    {
+        *igmt = 1;
+        return MTRR_TYPE_WRBACK;
+    }
+
+    return MTRR_TYPE_WRBACK;
+}
+
 static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type)
 {
     switch(type)
@@ -81,7 +114,6 @@ static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type)
             return;
         case p2m_mmio_direct:
              entry->r = entry->w = entry->x = 1;
-             entry->emt = 0x8;
             return;
         case p2m_ram_logdirty:
         case p2m_ram_ro:
@@ -112,6 +144,7 @@ static int ept_set_middle_entry(struct domain *d, ept_entry_t *ept_entry)
     list_add_tail(&pg->list, &d->arch.p2m.pages);
 
     ept_entry->emt = 0;
+    ept_entry->igmt = 0;
     ept_entry->sp_avail = 0;
     ept_entry->avail1 = 0;
     ept_entry->mfn = page_to_mfn(pg);
@@ -170,6 +203,9 @@ ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
     u32 index;
     int i, rv = 0, ret = 0;
     int walk_level = order / EPT_TABLE_ORDER;
+    int direct_mmio = (p2mt == p2m_mmio_direct);
+    uint8_t igmt = 0;
+    int need_modify_vtd_table = 1;
 
     /* We only support 4k and 2m pages now */
 
@@ -203,26 +239,31 @@ ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
     {
         if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) )
         {
-            /* Track the highest gfn for which we have ever had a valid mapping */
-            if ( gfn > d->arch.p2m.max_mapped_pfn )
-                d->arch.p2m.max_mapped_pfn = gfn;
-
-            if ( p2mt == p2m_mmio_direct )
-                ept_entry->emt = 0x8;
-            else
-                ept_entry->emt = EPT_DEFAULT_MT;
+            ept_entry->emt = epte_get_entry_emt(d, gfn, mfn_x(mfn),
+                                &igmt, direct_mmio);
+            ept_entry->igmt = igmt;
             ept_entry->sp_avail = walk_level ? 1 : 0;
 
             if ( ret == GUEST_TABLE_SUPER_PAGE )
             {
-                ept_entry->mfn = mfn_x(mfn) - offset;
+                if ( ept_entry->mfn == (mfn_x(mfn) - offset) )
+                    need_modify_vtd_table = 0;
+                else
+                    ept_entry->mfn = mfn_x(mfn) - offset;
+
                 if ( ept_entry->avail1 == p2m_ram_logdirty &&
                   p2mt == p2m_ram_rw )
                     for ( i = 0; i < (1UL << order); i++ )
                         paging_mark_dirty(d, mfn_x(mfn)-offset+i);
             }
             else
-                ept_entry->mfn = mfn_x(mfn);
+            {
+                if ( ept_entry->mfn == mfn_x(mfn) )
+                    need_modify_vtd_table = 0;
+                else
+                    ept_entry->mfn = mfn_x(mfn);
+            }
+
 
             ept_entry->avail1 = p2mt;
             ept_entry->rsvd = 0;
@@ -260,10 +301,11 @@ ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
         for ( i = 0; i < 512; i++ )
         {
             split_ept_entry = split_table + i;
-            if ( p2mt == p2m_mmio_direct )
-                split_ept_entry->emt = 0x8;
-            else
-                split_ept_entry->emt = EPT_DEFAULT_MT;
+            split_ept_entry->emt = epte_get_entry_emt(d,
+                                        gfn-offset+i, split_mfn+i,
+                                        &igmt, direct_mmio);
+            split_ept_entry->igmt = igmt;
+
             split_ept_entry->sp_avail =  0;
 
             split_ept_entry->mfn = split_mfn+i;
@@ -278,17 +320,25 @@ ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
 
         /* Set the destinated 4k page as normal */
         split_ept_entry = split_table + offset;
-        if ( p2mt == p2m_mmio_direct )
-            split_ept_entry->emt = 0x8;
+        split_ept_entry->emt = epte_get_entry_emt(d, gfn, mfn_x(mfn),
+                                                &igmt, direct_mmio);
+        split_ept_entry->igmt = igmt;
+        if ( split_ept_entry->mfn == mfn_x(mfn) )
+            need_modify_vtd_table = 0;
         else
-            split_ept_entry->emt = EPT_DEFAULT_MT;
-        split_ept_entry->mfn = mfn_x(mfn);
+            split_ept_entry->mfn = mfn_x(mfn);
+
         split_ept_entry->avail1 = p2mt;
         ept_p2m_type_to_flags(split_ept_entry, p2mt);
 
         unmap_domain_page(split_table);
     }
 
+    /* Track the highest gfn for which we have ever had a valid mapping */
+    if ( mfn_valid(mfn_x(mfn))
+         && (gfn + (1UL << order) - 1 > d->arch.p2m.max_mapped_pfn) )
+        d->arch.p2m.max_mapped_pfn = gfn + (1UL << order) - 1;
+
     /* Success */
     rv = 1;
 
@@ -297,7 +347,8 @@ ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
     ept_sync_domain(d);
 
     /* support pci pass-through */
-    if ( iommu_enabled )
+    if ( iommu_enabled && is_hvm_domain(d) &&
+         need_modify_vtd_table )
     {
         if ( p2mt == p2m_ram_rw )
         {
diff --git a/drivers/passthrough/iommu.c b/drivers/passthrough/iommu.c
index 2661d2b..0cd3aa9 100644
--- a/drivers/passthrough/iommu.c
+++ b/drivers/passthrough/iommu.c
@@ -29,16 +29,20 @@ int intel_vtd_setup(void);
  *   off|no|false|disable       Disable IOMMU (default)
  *   force|required             Don't boot unless IOMMU is enabled
  *   passthrough                Bypass VT-d translation for Dom0
+ *   snoop                      Utilize snoop control for the IOMMU (default)
+ *   no-snoop                   Don't utilize snoop control for the IOMMU
  */
 custom_param("iommu", parse_iommu_param);
 int iommu_enabled = 0;
 int force_iommu = 0;
 int iommu_passthrough = 0;
+int iommu_snoop = 0;
 
 static void __init parse_iommu_param(char *s)
 {
     char *ss;
     iommu_enabled = 1;
+    iommu_snoop = 1;
 
     do {
         ss = strchr(s, ',');
@@ -52,6 +56,10 @@ static void __init parse_iommu_param(char *s)
             force_iommu = 1;
         else if ( !strcmp(s, "passthrough") )
             iommu_passthrough = 1;
+        else if ( !strcmp(s, "snoop") )
+            iommu_snoop = 1;
+        else if ( !strcmp(s, "no-snoop") )
+            iommu_snoop = 0;
 
         s = ss + 1;
     } while ( ss );
diff --git a/drivers/passthrough/vtd/dmar.c b/drivers/passthrough/vtd/dmar.c
index 4e20aef..63e4e0b 100644
--- a/drivers/passthrough/vtd/dmar.c
+++ b/drivers/passthrough/vtd/dmar.c
@@ -29,6 +29,7 @@
 #include <xen/pci_regs.h>
 #include <asm/string.h>
 #include "dmar.h"
+#include "iommu.h"
 
 int vtd_enabled = 1;
 
diff --git a/drivers/passthrough/vtd/iommu.c b/drivers/passthrough/vtd/iommu.c
index 6a44091..1705e2f 100644
--- a/drivers/passthrough/vtd/iommu.c
+++ b/drivers/passthrough/vtd/iommu.c
@@ -1473,6 +1473,11 @@ int intel_iommu_map_page(
     pte_present = dma_pte_present(*pte);
     dma_set_pte_addr(*pte, (paddr_t)mfn << PAGE_SHIFT_4K);
     dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE);
+
+    /* Set the SNP field on leaf page table entries if Snoop Control is available */
+    if ( iommu_snoop )
+        dma_set_pte_snp(*pte);
+
     iommu_flush_cache_entry(pte);
     spin_unlock(&hd->mapping_lock);
     unmap_vtd_domain_page(page);
@@ -1746,12 +1751,29 @@ int intel_vtd_setup(void)
     if ( init_vtd_hw() )
         goto error;
 
+    /* Given that all devices within a guest use the same IO page table,
+     * enable snoop control only if all VT-d engines support it.
+     */
+    if ( iommu_snoop )
+    {
+        for_each_drhd_unit ( drhd )
+        {
+            iommu = drhd->iommu;
+            if ( !ecap_snp_ctl(iommu->ecap) ) {
+                iommu_snoop = 0;
+                break;
+            }
+        }
+    }
+
+    printk("Intel VT-d snoop control %sabled\n", iommu_snoop ? "en" : "dis");
     return 0;
 
  error:
     for_each_drhd_unit ( drhd )
         iommu_free(drhd);
     vtd_enabled = 0;
+    iommu_snoop = 0;
     return -ENOMEM;
 }
 
diff --git a/drivers/passthrough/vtd/iommu.h b/drivers/passthrough/vtd/iommu.h
index 5037e57..d5582ba 100644
--- a/drivers/passthrough/vtd/iommu.h
+++ b/drivers/passthrough/vtd/iommu.h
@@ -104,6 +104,7 @@
 #define ecap_ext_intr(e)         ((e >> 4) & 0x1)
 #define ecap_cache_hints(e)      ((e >> 5) & 0x1)
 #define ecap_pass_thru(e)        ((e >> 6) & 0x1)
+#define ecap_snp_ctl(e)          ((e >> 7) & 0x1)
 
 /* IOTLB_REG */
 #define DMA_TLB_FLUSH_GRANU_OFFSET  60
@@ -260,10 +261,14 @@ struct dma_pte {
 };
 #define DMA_PTE_READ (1)
 #define DMA_PTE_WRITE (2)
+#define DMA_PTE_SNP  (1 << 11)
+
 #define dma_clear_pte(p)    do {(p).val = 0;} while(0)
 #define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while(0)
 #define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while(0)
 #define dma_set_pte_superpage(p) do {(p).val |= (1 << 7);} while(0)
+#define dma_set_pte_snp(p)  do {(p).val |= DMA_PTE_SNP;} while(0)
+
 #define dma_set_pte_prot(p, prot) \
             do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0)
 #define dma_pte_addr(p) ((p).val & PAGE_MASK_4K)
diff --git a/include/asm-x86/hvm/vmx/vmx.h b/include/asm-x86/hvm/vmx/vmx.h
index aa30520..c40f092 100644
--- a/include/asm-x86/hvm/vmx/vmx.h
+++ b/include/asm-x86/hvm/vmx/vmx.h
@@ -35,7 +35,8 @@ typedef union {
         u64 r       :   1,
         w           :   1,
         x           :   1,
-        emt         :   4,
+        emt         :   3,
+        igmt        :   1,
         sp_avail    :   1,
         avail1      :   4,
         mfn         :   45,
diff --git a/include/xen/iommu.h b/include/xen/iommu.h
index 440d9b6..19f21c5 100644
--- a/include/xen/iommu.h
+++ b/include/xen/iommu.h
@@ -31,6 +31,7 @@ extern int iommu_enabled;
 extern int iommu_pv_enabled;
 extern int force_iommu;
 extern int iommu_passthrough;
+extern int iommu_snoop;
 
 #define domain_hvm_iommu(d)     (&d->arch.hvm_domain.hvm_iommu)
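
As a side note for reviewers, the effective-memory-type policy that
epte_get_entry_emt() implements boils down to a small decision table.
Below is a minimal, self-contained C sketch of that table (the constants
and the emt_for() helper are illustrative stand-ins, not the hypervisor
code itself):

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-ins for the Xen MTRR type constants. */
#define MTRR_TYPE_UNCACHABLE 0
#define MTRR_TYPE_WRBACK     6

/*
 * Same decision order as epte_get_entry_emt():
 *   invalid mfn          -> UC
 *   no IOMMU             -> WB, ignore guest PAT (igmt = 1)
 *   direct MMIO          -> UC
 *   IOMMU w/ snoop ctl   -> WB, ignore guest PAT (igmt = 1)
 *   IOMMU w/o snoop ctl  -> WB, honour guest PAT (igmt = 0)
 */
static uint8_t emt_for(int mfn_is_valid, int iommu_on, int snoop_on,
                       int direct_mmio, uint8_t *igmt)
{
    *igmt = 0;

    if ( !mfn_is_valid )
        return MTRR_TYPE_UNCACHABLE;

    if ( !iommu_on )
    {
        *igmt = 1;
        return MTRR_TYPE_WRBACK;
    }

    if ( direct_mmio )
        return MTRR_TYPE_UNCACHABLE;

    if ( snoop_on )
        *igmt = 1;

    return MTRR_TYPE_WRBACK;
}

int main(void)
{
    uint8_t igmt;
    uint8_t emt = emt_for(1, 1, 1, 0, &igmt);

    /* Expect emt=6 (WB), igmt=1: snoop control keeps device accesses
     * coherent, so the guest's PAT can safely be ignored. */
    printf("emt=%u igmt=%u\n", emt, igmt);
    return 0;
}

In the patch itself the result is cached in the new igmt bit of each EPT
entry (bit 6, carved out of the old 4-bit emt field), which is why the old
hard-coded emt = 0x8 for direct MMIO (ignore-PAT set plus type UC) could
be dropped.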