Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > fc11cd6e1c513a17304da94a5390f3cd > files > 4537

kernel-2.6.18-194.11.1.el5.src.rpm

From: ddugger@redhat.com <ddugger@redhat.com>
Date: Mon, 23 Mar 2009 10:23:29 -0600
Subject: [xen] MSI support internal functions
Message-id: 200903231623.n2NGNTA3022132@sobek.n0ano.com
O-Subject: [RHEL5.4 PATCH 17/21 V2] xen: MSI support internal functions
Bugzilla: 484227
RH-Acked-by: Gerd Hoffmann <kraxel@redhat.com>

Add MSI and MSI-X helper functions in Xen.
Also add additional fixmap mapping for MSI-X table.

[Version 2 update: fix by kraxel, don't use syscall & hypercall vectors]

Upstream Status: Accepted (CS 17535, 17942)

BZ: 484227

Signed-off-by: Qing He <qing.he@intel.com>
Signed-off-by: Gerd Hoffman <kraxel@redhat.com>
Signed-off-by: Don Dugger <donald.d.dugger@intel.com>

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 732e2e8..3c2ddfe 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -47,6 +47,7 @@ obj-y += x86_emulate.o
 obj-y += machine_kexec.o
 obj-y += crash.o
 obj-y += pci.o
+obj-y += msi.o
 
 obj-$(crash_debug) += gdbstub.o
 
diff --git a/arch/x86/i8259.c b/arch/x86/i8259.c
index 2069319..0874fad 100644
--- a/arch/x86/i8259.c
+++ b/arch/x86/i8259.c
@@ -405,6 +405,10 @@ void __init init_IRQ(void)
         irq_desc[LEGACY_VECTOR(i)].handler = &i8259A_irq_type;
     }
 
+    /* Never allocate the hypercall vector or Linux/BSD fast-trap vector. */
+    vector_irq[HYPERCALL_VECTOR] = NEVER_ASSIGN;
+    vector_irq[0x80] = NEVER_ASSIGN;
+
     apic_intr_init();
 
     /* Set the clock to HZ Hz */
diff --git a/arch/x86/io_apic.c b/arch/x86/io_apic.c
index 5f46652..c450b16 100644
--- a/arch/x86/io_apic.c
+++ b/arch/x86/io_apic.c
@@ -27,16 +27,17 @@
 #include <xen/delay.h>
 #include <xen/sched.h>
 #include <xen/acpi.h>
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
 #include <xen/keyhandler.h>
 #include <asm/io.h>
 #include <asm/mc146818rtc.h>
 #include <asm/smp.h>
 #include <asm/desc.h>
+#include <asm/msi.h>
 #include <mach_apic.h>
 #include <io_ports.h>
-
-#define set_irq_info(irq, mask) ((void)0)
-#define set_native_irq_info(irq, mask) ((void)0)
+#include <public/physdev.h>
 
 /* Different to Linux: our implementation can be simpler. */
 #define make_8259A_irq(irq) (io_apic_irqs &= ~(1<<(irq)))
@@ -86,7 +87,8 @@ static struct irq_pin_list {
 } irq_2_pin[PIN_MAP_SIZE];
 static int irq_2_pin_free_entry = NR_IRQS;
 
-int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
+int vector_irq[NR_VECTORS] __read_mostly = {
+    [0 ... NR_VECTORS - 1] = FREE_TO_ASSIGN};
 
 /*
  * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
@@ -663,42 +665,49 @@ static inline int IO_APIC_irq_trigger(int irq)
 }
 
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
+u8 irq_vector[NR_IRQS] __read_mostly;
+
+int free_irq_vector(int vector)
+{
+    int irq;
+
+    BUG_ON((vector > LAST_DYNAMIC_VECTOR) || (vector < FIRST_DYNAMIC_VECTOR));
+
+    spin_lock(&vector_lock);
+    if ((irq = vector_irq[vector]) == AUTO_ASSIGN)
+        vector_irq[vector] = FREE_TO_ASSIGN;
+    spin_unlock(&vector_lock);
+
+    return (irq == AUTO_ASSIGN) ? 0 : -EINVAL;
+}
 
 int assign_irq_vector(int irq)
 {
-    static unsigned current_vector = FIRST_DYNAMIC_VECTOR, offset = 0;
+    static unsigned current_vector = FIRST_DYNAMIC_VECTOR;
     unsigned vector;
 
-    BUG_ON(irq >= NR_IRQ_VECTORS);
+    BUG_ON(irq >= NR_IRQS);
+
     spin_lock(&vector_lock);
 
-    if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
+    if ((irq != AUTO_ASSIGN) && (IO_APIC_VECTOR(irq) > 0)) {
         spin_unlock(&vector_lock);
         return IO_APIC_VECTOR(irq);
     }
 
-next:
-    current_vector += 8;
-
-    /* Skip the hypercall vector. */
-    if (current_vector == HYPERCALL_VECTOR)
-        goto next;
-
-    /* Skip the Linux/BSD fast-trap vector. */
-    if (current_vector == 0x80)
-        goto next;
+    vector = current_vector;
+    while (vector_irq[vector] != FREE_TO_ASSIGN) {
+        vector += 8;
+        if (vector > LAST_DYNAMIC_VECTOR)
+            vector = FIRST_DYNAMIC_VECTOR + ((vector + 1) & 7);
 
-    if (current_vector > LAST_DYNAMIC_VECTOR) {
-        offset++;
-        if (!(offset%8)) {
+        if (vector == current_vector) {
             spin_unlock(&vector_lock);
             return -ENOSPC;
         }
-        current_vector = FIRST_DYNAMIC_VECTOR + offset;
     }
 
-    vector = current_vector;
+    current_vector = vector;
     vector_irq[vector] = irq;
     if (irq != AUTO_ASSIGN)
         IO_APIC_VECTOR(irq) = vector;
@@ -1104,6 +1113,7 @@ void disable_IO_APIC(void)
         entry.delivery_mode   = dest_ExtINT; /* ExtInt */
         entry.vector          = 0;
         entry.dest.physical.physical_dest =
+        //  TODO: BP: should be get_apic_id
             GET_APIC_ID(apic_read(APIC_ID));
 
         /*
@@ -1532,6 +1542,50 @@ static struct hw_interrupt_type ioapic_level_type = {
     .set_affinity 	= set_ioapic_affinity_vector,
 };
 
+static unsigned int startup_msi_vector(unsigned int vector)
+{
+    unmask_msi_vector(vector);
+    return 0;
+}
+
+static void ack_msi_vector(unsigned int vector)
+{
+    if ( msi_maskable_irq(irq_desc[vector].msi_desc) )
+        ack_APIC_irq(); /* ACKTYPE_NONE */
+}
+
+static void end_msi_vector(unsigned int vector)
+{
+    if ( !msi_maskable_irq(irq_desc[vector].msi_desc) )
+        ack_APIC_irq(); /* ACKTYPE_EOI */
+}
+
+static void shutdown_msi_vector(unsigned int vector)
+{
+    mask_msi_vector(vector);
+}
+
+static void set_msi_affinity_vector(unsigned int vector, cpumask_t cpu_mask)
+{
+    set_native_irq_info(vector, cpu_mask);
+    set_msi_affinity(vector, cpu_mask);
+}
+
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+struct hw_interrupt_type pci_msi_type = {
+    .typename   = "PCI-MSI",
+    .startup    = startup_msi_vector,
+    .shutdown   = shutdown_msi_vector,
+    .enable	    = unmask_msi_vector,
+    .disable    = mask_msi_vector,
+    .ack        = ack_msi_vector,
+    .end        = end_msi_vector,
+    .set_affinity   = set_msi_affinity_vector,
+};
+
 static inline void init_IO_APIC_traps(void)
 {
     int irq;
@@ -1649,6 +1703,9 @@ static inline void check_timer(void)
 {
     int apic1, pin1, apic2, pin2;
     int vector;
+    unsigned long flags;
+
+    local_irq_save(flags);
 
     /*
      * get/set the timer IRQ vector:
@@ -1690,6 +1747,7 @@ static inline void check_timer(void)
          */
         unmask_IO_APIC_irq(0);
         if (timer_irq_works()) {
+            local_irq_restore(flags);
             if (disable_timer_pin_1 > 0)
                 clear_IO_APIC_pin(apic1, pin1);
             return;
@@ -1707,6 +1765,7 @@ static inline void check_timer(void)
          */
         setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
         if (timer_irq_works()) {
+            local_irq_restore(flags);
             printk("works.\n");
             if (pin1 != -1)
                 replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
@@ -1734,6 +1793,7 @@ static inline void check_timer(void)
     enable_8259A_irq(0);
 
     if (timer_irq_works()) {
+        local_irq_restore(flags);
         printk(" works.\n");
         return;
     }
@@ -1749,6 +1809,8 @@ static inline void check_timer(void)
 
     unlock_ExtINT_logic();
 
+    local_irq_restore(flags);
+
     if (timer_irq_works()) {
         printk(" works.\n");
         return;
@@ -2128,7 +2190,7 @@ int ioapic_guest_write(unsigned long physbase, unsigned int reg, u32 val)
     if ( new_rte.vector >= FIRST_DYNAMIC_VECTOR )
         new_irq = vector_irq[new_rte.vector];
 
-    if ( (old_irq != new_irq) && (old_irq != -1) && IO_APIC_IRQ(old_irq) )
+    if ( (old_irq != new_irq) && (old_irq >= 0) && IO_APIC_IRQ(old_irq) )
     {
         if ( irq_desc[IO_APIC_VECTOR(old_irq)].action )
         {
@@ -2140,7 +2202,7 @@ int ioapic_guest_write(unsigned long physbase, unsigned int reg, u32 val)
         remove_pin_at_irq(old_irq, apic, pin);
     }
 
-    if ( (new_irq != -1) && IO_APIC_IRQ(new_irq) )
+    if ( (new_irq >= 0) && IO_APIC_IRQ(new_irq) )
     {
         if ( irq_desc[IO_APIC_VECTOR(new_irq)].action )
         {
diff --git a/arch/x86/msi.c b/arch/x86/msi.c
new file mode 100644
index 0000000..d968ba6
--- /dev/null
+++ b/arch/x86/msi.c
@@ -0,0 +1,786 @@
+/*
+ * File:    msi.c
+ * Purpose: PCI Message Signaled Interrupt (MSI)
+ *
+ * Copyright (C) 2003-2004 Intel
+ * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com)
+ */
+
+#include <xen/config.h>
+#include <xen/lib.h>
+#include <xen/init.h>
+#include <xen/irq.h>
+#include <xen/delay.h>
+#include <xen/sched.h>
+#include <xen/acpi.h>
+#include <xen/errno.h>
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
+#include <xen/keyhandler.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/desc.h>
+#include <asm/msi.h>
+#include <asm/fixmap.h>
+#include <mach_apic.h>
+#include <io_ports.h>
+#include <public/physdev.h>
+#include <xen/iommu.h>
+
+/* bitmap indicate which fixed map is free */
+DEFINE_SPINLOCK(msix_fixmap_lock);
+DECLARE_BITMAP(msix_fixmap_pages, MAX_MSIX_PAGES);
+
+static int msix_fixmap_alloc(void)
+{
+    int i, rc = -1;
+
+    spin_lock(&msix_fixmap_lock);
+    for ( i = 0; i < MAX_MSIX_PAGES; i++ )
+        if ( !test_bit(i, &msix_fixmap_pages) )
+            break;
+    if ( i == MAX_MSIX_PAGES )
+        goto out;
+    rc = FIX_MSIX_IO_RESERV_BASE + i;
+    set_bit(i, &msix_fixmap_pages);
+
+ out:
+    spin_unlock(&msix_fixmap_lock);
+    return rc;
+}
+
+static void msix_fixmap_free(int idx)
+{
+    if ( idx >= FIX_MSIX_IO_RESERV_BASE )
+        clear_bit(idx - FIX_MSIX_IO_RESERV_BASE, &msix_fixmap_pages);
+}
+
+/*
+ * MSI message composition
+ */
+static void msi_compose_msg(struct pci_dev *pdev, int vector,
+                            struct msi_msg *msg)
+{
+    unsigned dest;
+    cpumask_t tmp;
+
+    tmp = TARGET_CPUS;
+    if ( vector )
+    {
+        dest = cpu_mask_to_apicid(tmp);
+
+        msg->address_hi = MSI_ADDR_BASE_HI;
+        msg->address_lo =
+            MSI_ADDR_BASE_LO |
+            ((INT_DEST_MODE == 0) ?
+             MSI_ADDR_DESTMODE_PHYS:
+             MSI_ADDR_DESTMODE_LOGIC) |
+            ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+             MSI_ADDR_REDIRECTION_CPU:
+             MSI_ADDR_REDIRECTION_LOWPRI) |
+            MSI_ADDR_DEST_ID(dest);
+
+        msg->data =
+            MSI_DATA_TRIGGER_EDGE |
+            MSI_DATA_LEVEL_ASSERT |
+            ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+             MSI_DATA_DELIVERY_FIXED:
+             MSI_DATA_DELIVERY_LOWPRI) |
+            MSI_DATA_VECTOR(vector);
+    }
+}
+
+static void read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
+{
+    switch ( entry->msi_attrib.type )
+    {
+    case PCI_CAP_ID_MSI:
+    {
+        struct pci_dev *dev = entry->dev;
+        int pos = entry->msi_attrib.pos;
+        u16 data;
+        u8 bus = dev->bus;
+        u8 slot = PCI_SLOT(dev->devfn);
+        u8 func = PCI_FUNC(dev->devfn);
+
+        msg->address_lo = pci_conf_read32(bus, slot, func,
+                                          msi_lower_address_reg(pos));
+        if ( entry->msi_attrib.is_64 )
+        {
+            msg->address_hi = pci_conf_read32(bus, slot, func,
+                                              msi_upper_address_reg(pos));
+            data = pci_conf_read16(bus, slot, func, msi_data_reg(pos, 1));
+        }
+        else
+        {
+            msg->address_hi = 0;
+            data = pci_conf_read16(bus, slot, func, msi_data_reg(pos, 0));
+        }
+        msg->data = data;
+        break;
+    }
+    case PCI_CAP_ID_MSIX:
+    {
+        void __iomem *base;
+        base = entry->mask_base +
+            entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+
+        msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
+        msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
+        msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET);
+        break;
+    }
+    default:
+        BUG();
+    }
+
+
+//  TODO: BP: interrupt remapping
+//    if ( vtd_enabled )
+//        msi_msg_read_remap_rte(entry, msg);
+}
+
+static int set_vector_msi(struct msi_desc *entry)
+{
+    if ( entry->vector >= NR_VECTORS )
+    {
+        dprintk(XENLOG_ERR, "Trying to install msi data for Vector %d\n",
+                entry->vector);
+        return -EINVAL;
+    }
+
+    irq_desc[entry->vector].msi_desc = entry;
+    return 0;
+}
+
+static int unset_vector_msi(int vector)
+{
+    ASSERT(spin_is_locked(&irq_desc[vector].lock));
+
+    if ( vector >= NR_VECTORS )
+    {
+        dprintk(XENLOG_ERR, "Trying to uninstall msi data for Vector %d\n",
+                vector);
+        return -EINVAL;
+    }
+
+    irq_desc[vector].msi_desc = NULL;
+
+    return 0;
+}
+
+static void write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
+{
+//  TODO: BP: interrupt remapping
+//    if ( iommu_enabled )
+//        iommu_update_ire_from_msi(entry, msg);
+
+    switch ( entry->msi_attrib.type )
+    {
+    case PCI_CAP_ID_MSI:
+    {
+        struct pci_dev *dev = entry->dev;
+        int pos = entry->msi_attrib.pos;
+        u8 bus = dev->bus;
+        u8 slot = PCI_SLOT(dev->devfn);
+        u8 func = PCI_FUNC(dev->devfn);
+
+        pci_conf_write32(bus, slot, func, msi_lower_address_reg(pos),
+                         msg->address_lo);
+        if ( entry->msi_attrib.is_64 )
+        {
+            pci_conf_write32(bus, slot, func, msi_upper_address_reg(pos),
+                             msg->address_hi);
+            pci_conf_write16(bus, slot, func, msi_data_reg(pos, 1),
+                             msg->data);
+        }
+        else
+            pci_conf_write16(bus, slot, func, msi_data_reg(pos, 0),
+                             msg->data);
+        break;
+    }
+    case PCI_CAP_ID_MSIX:
+    {
+        void __iomem *base;
+        base = entry->mask_base +
+            entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+
+        writel(msg->address_lo,
+               base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
+        writel(msg->address_hi,
+               base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
+        writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET);
+        break;
+    }
+    default:
+        BUG();
+    }
+    entry->msg = *msg;
+}
+
+void set_msi_affinity(unsigned int vector, cpumask_t mask)
+{
+    struct msi_desc *desc = irq_desc[vector].msi_desc;
+    struct msi_msg msg;
+    unsigned int dest;
+
+    memset(&msg, 0, sizeof(msg));
+    cpus_and(mask, mask, cpu_online_map);
+    if ( cpus_empty(mask) )
+        mask = TARGET_CPUS;
+    dest = cpu_mask_to_apicid(mask);
+
+    if ( !desc )
+        return;
+
+    ASSERT(spin_is_locked(&irq_desc[vector].lock));
+    read_msi_msg(desc, &msg);
+
+    msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+    msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+    write_msi_msg(desc, &msg);
+}
+
+static void msi_set_enable(struct pci_dev *dev, int enable)
+{
+    int pos;
+    u16 control;
+    u8 bus = dev->bus;
+    u8 slot = PCI_SLOT(dev->devfn);
+    u8 func = PCI_FUNC(dev->devfn);
+
+    pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSI);
+    if ( pos )
+    {
+        control = pci_conf_read16(bus, slot, func, pos + PCI_MSI_FLAGS);
+        control &= ~PCI_MSI_FLAGS_ENABLE;
+        if ( enable )
+            control |= PCI_MSI_FLAGS_ENABLE;
+        pci_conf_write16(bus, slot, func, pos + PCI_MSI_FLAGS, control);
+    }
+}
+
+static void msix_set_enable(struct pci_dev *dev, int enable)
+{
+    int pos;
+    u16 control;
+    u8 bus = dev->bus;
+    u8 slot = PCI_SLOT(dev->devfn);
+    u8 func = PCI_FUNC(dev->devfn);
+
+    pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSIX);
+    if ( pos )
+    {
+        control = pci_conf_read16(bus, slot, func, pos + PCI_MSIX_FLAGS);
+        control &= ~PCI_MSIX_FLAGS_ENABLE;
+        if ( enable )
+            control |= PCI_MSIX_FLAGS_ENABLE;
+        pci_conf_write16(bus, slot, func, pos + PCI_MSIX_FLAGS, control);
+    }
+}
+
+static void msix_flush_writes(unsigned int vector)
+{
+    struct msi_desc *entry = irq_desc[vector].msi_desc;
+
+    BUG_ON(!entry || !entry->dev);
+    switch (entry->msi_attrib.type) {
+    case PCI_CAP_ID_MSI:
+        /* nothing to do */
+        break;
+    case PCI_CAP_ID_MSIX:
+    {
+        int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
+            PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
+        readl(entry->mask_base + offset);
+        break;
+    }
+    default:
+        BUG();
+        break;
+    }
+}
+
+int msi_maskable_irq(const struct msi_desc *entry)
+{
+    BUG_ON(!entry);
+    return entry->msi_attrib.type != PCI_CAP_ID_MSI
+           || entry->msi_attrib.maskbit;
+}
+
+static void msi_set_mask_bit(unsigned int vector, int flag)
+{
+    struct msi_desc *entry = irq_desc[vector].msi_desc;
+
+    ASSERT(spin_is_locked(&irq_desc[vector].lock));
+    BUG_ON(!entry || !entry->dev);
+    switch (entry->msi_attrib.type) {
+    case PCI_CAP_ID_MSI:
+        if (entry->msi_attrib.maskbit) {
+            int pos;
+            u32 mask_bits;
+            u8 bus = entry->dev->bus;
+            u8 slot = PCI_SLOT(entry->dev->devfn);
+            u8 func = PCI_FUNC(entry->dev->devfn);
+
+            pos = (long)entry->mask_base;
+            mask_bits = pci_conf_read32(bus, slot, func, pos);
+            mask_bits &= ~(1);
+            mask_bits |= flag;
+            pci_conf_write32(bus, slot, func, pos, mask_bits);
+        }
+        break;
+    case PCI_CAP_ID_MSIX:
+    {
+        int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
+            PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
+        writel(flag, entry->mask_base + offset);
+        readl(entry->mask_base + offset);
+        break;
+    }
+    default:
+        BUG();
+        break;
+    }
+    entry->msi_attrib.masked = !!flag;
+}
+
+void mask_msi_vector(unsigned int vector)
+{
+    msi_set_mask_bit(vector, 1);
+    msix_flush_writes(vector);
+}
+
+void unmask_msi_vector(unsigned int vector)
+{
+    msi_set_mask_bit(vector, 0);
+    msix_flush_writes(vector);
+}
+
+static struct msi_desc* alloc_msi_entry(void)
+{
+    struct msi_desc *entry;
+
+    entry = xmalloc(struct msi_desc);
+    if ( !entry )
+        return NULL;
+
+    INIT_LIST_HEAD(&entry->list);
+    entry->dev = NULL;
+    entry->remap_index = -1;
+
+    return entry;
+}
+
+int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+{
+    struct msi_msg msg;
+
+    msi_compose_msg(dev, desc->vector, &msg);
+    set_vector_msi(desc);
+    write_msi_msg(irq_desc[desc->vector].msi_desc, &msg);
+
+    return 0;
+}
+
+void teardown_msi_vector(int vector)
+{
+    unset_vector_msi(vector);
+}
+
+int msi_free_vector(struct msi_desc *entry)
+{
+    if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
+    {
+        unsigned long start;
+
+        writel(1, entry->mask_base + entry->msi_attrib.entry_nr
+               * PCI_MSIX_ENTRY_SIZE
+               + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+
+        start = (unsigned long)entry->mask_base & ~(PAGE_SIZE - 1);
+        msix_fixmap_free(virt_to_fix(start));
+        destroy_xen_mappings(start, start + PAGE_SIZE);
+    }
+    list_del(&entry->list);
+    xfree(entry);
+    return 0;
+}
+
+static struct msi_desc *find_msi_entry(struct pci_dev *dev,
+                                       int vector, int cap_id)
+{
+    struct msi_desc *entry;
+
+    list_for_each_entry( entry, &dev->msi_list, list )
+    {
+        if ( entry->msi_attrib.type == cap_id &&
+             (vector == -1 || entry->vector == vector) )
+            return entry;
+    }
+
+    return NULL;
+}
+
+/**
+ * msi_capability_init - configure device's MSI capability structure
+ * @dev: pointer to the pci_dev data structure of MSI device function
+ *
+ * Setup the MSI capability structure of device function with a single
+ * MSI irq, regardless of device function is capable of handling
+ * multiple messages. A return of zero indicates the successful setup
+ * of an entry zero with the new MSI irq or non-zero for otherwise.
+ **/
+static int msi_capability_init(struct pci_dev *dev,
+                               int vector,
+                               struct msi_desc **desc)
+{
+    struct msi_desc *entry;
+    int pos;
+    u16 control;
+    u8 bus = dev->bus;
+    u8 slot = PCI_SLOT(dev->devfn);
+    u8 func = PCI_FUNC(dev->devfn);
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSI);
+    control = pci_conf_read16(bus, slot, func, msi_control_reg(pos));
+    /* MSI Entry Initialization */
+    msi_set_enable(dev, 0); /* Ensure msi is disabled as I set it up */
+
+    entry = alloc_msi_entry();
+    if ( !entry )
+        return -ENOMEM;
+
+    entry->msi_attrib.type = PCI_CAP_ID_MSI;
+    entry->msi_attrib.is_64 = is_64bit_address(control);
+    entry->msi_attrib.entry_nr = 0;
+    entry->msi_attrib.maskbit = is_mask_bit_support(control);
+    entry->msi_attrib.masked = 1;
+    entry->msi_attrib.pos = pos;
+    entry->vector = vector;
+    if ( is_mask_bit_support(control) )
+        entry->mask_base = (void __iomem *)(long)msi_mask_bits_reg(pos,
+                                                                   is_64bit_address(control));
+    entry->dev = dev;
+    if ( entry->msi_attrib.maskbit )
+    {
+        unsigned int maskbits, temp;
+        /* All MSIs are unmasked by default, Mask them all */
+        maskbits = pci_conf_read32(bus, slot, func,
+                                   msi_mask_bits_reg(pos, is_64bit_address(control)));
+        temp = (1 << multi_msi_capable(control));
+        temp = ((temp - 1) & ~temp);
+        maskbits |= temp;
+        pci_conf_write32(bus, slot, func,
+                         msi_mask_bits_reg(pos, is_64bit_address(control)),
+                         maskbits);
+    }
+    list_add_tail(&entry->list, &dev->msi_list);
+
+    *desc = entry;
+    /* Restore the original MSI enabled bits  */
+    pci_conf_write16(bus, slot, func, msi_control_reg(pos), control);
+
+    return 0;
+}
+
+/**
+ * msix_capability_init - configure device's MSI-X capability
+ * @dev: pointer to the pci_dev data structure of MSI-X device function
+ * @entries: pointer to an array of struct msix_entry entries
+ * @nvec: number of @entries
+ *
+ * Setup the MSI-X capability structure of device function with a
+ * single MSI-X irq. A return of zero indicates the successful setup of
+ * requested MSI-X entries with allocated irqs or non-zero for otherwise.
+ **/
+static int msix_capability_init(struct pci_dev *dev,
+                                struct msi_info *msi,
+                                struct msi_desc **desc)
+{
+    struct msi_desc *entry;
+    int pos;
+    u16 control;
+    unsigned long phys_addr;
+    u32 table_offset;
+    u8 bir;
+    void __iomem *base;
+    int idx;
+    u8 bus = dev->bus;
+    u8 slot = PCI_SLOT(dev->devfn);
+    u8 func = PCI_FUNC(dev->devfn);
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    ASSERT(desc);
+
+    pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSIX);
+    control = pci_conf_read16(bus, slot, func, msix_control_reg(pos));
+    msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
+
+    /* MSI-X Table Initialization */
+    entry = alloc_msi_entry();
+    if ( !entry )
+        return -ENOMEM;
+
+    /* Request & Map MSI-X table region */
+    table_offset = pci_conf_read32(bus, slot, func, msix_table_offset_reg(pos));
+    bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
+    table_offset &= ~PCI_MSIX_FLAGS_BIRMASK;
+    phys_addr = msi->table_base + table_offset;
+    idx = msix_fixmap_alloc();
+    if ( idx < 0 )
+    {
+        xfree(entry);
+        return -ENOMEM;
+    }
+    set_fixmap_nocache(idx, phys_addr);
+    base = (void *)(fix_to_virt(idx) + (phys_addr & ((1UL << PAGE_SHIFT) - 1)));
+
+    entry->msi_attrib.type = PCI_CAP_ID_MSIX;
+    entry->msi_attrib.is_64 = 1;
+    entry->msi_attrib.entry_nr = msi->entry_nr;
+    entry->msi_attrib.maskbit = 1;
+    entry->msi_attrib.masked = 1;
+    entry->msi_attrib.pos = pos;
+    entry->vector = msi->vector;
+    entry->dev = dev;
+    entry->mask_base = base;
+
+    list_add_tail(&entry->list, &dev->msi_list);
+
+    /* Mask interrupt here */
+    writel(1, entry->mask_base + entry->msi_attrib.entry_nr
+                * PCI_MSIX_ENTRY_SIZE
+                + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+
+    *desc = entry;
+    /* Restore MSI-X enabled bits */
+    pci_conf_write16(bus, slot, func, msix_control_reg(pos), control);
+
+    return 0;
+}
+
+/**
+ * pci_enable_msi - configure device's MSI capability structure
+ * @dev: pointer to the pci_dev data structure of MSI device function
+ *
+ * Setup the MSI capability structure of device function with
+ * a single MSI irq upon its software driver call to request for
+ * MSI mode enabled on its hardware device function. A return of zero
+ * indicates the successful setup of an entry zero with the new MSI
+ * irq or non-zero for otherwise.
+ **/
+static int __pci_enable_msi(struct msi_info *msi, struct msi_desc **desc)
+{
+    int status;
+    struct pci_dev *pdev;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    pdev = pci_get_pdev(msi->bus, msi->devfn);
+    if ( !pdev )
+        return -ENODEV;
+
+    if ( find_msi_entry(pdev, msi->vector, PCI_CAP_ID_MSI) )
+    {
+        dprintk(XENLOG_WARNING, "vector %d has already mapped to MSI on "
+                "device %02x:%02x.%01x.\n", msi->vector, msi->bus,
+                PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+        return 0;
+    }
+
+    status = msi_capability_init(pdev, msi->vector, desc);
+    return status;
+}
+
+static void __pci_disable_msi(struct msi_desc *entry)
+{
+    struct pci_dev *dev;
+    int pos;
+    u16 control;
+    u8 bus, slot, func;
+
+    dev = entry->dev;
+    bus = dev->bus;
+    slot = PCI_SLOT(dev->devfn);
+    func = PCI_FUNC(dev->devfn);
+
+    pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSI);
+    control = pci_conf_read16(bus, slot, func, msi_control_reg(pos));
+    msi_set_enable(dev, 0);
+
+    BUG_ON(list_empty(&dev->msi_list));
+
+}
+
+/**
+ * pci_enable_msix - configure device's MSI-X capability structure
+ * @dev: pointer to the pci_dev data structure of MSI-X device function
+ * @entries: pointer to an array of MSI-X entries
+ * @nvec: number of MSI-X irqs requested for allocation by device driver
+ *
+ * Setup the MSI-X capability structure of device function with the number
+ * of requested irqs upon its software driver call to request for
+ * MSI-X mode enabled on its hardware device function. A return of zero
+ * indicates the successful configuration of MSI-X capability structure
+ * with new allocated MSI-X irqs. A return of < 0 indicates a failure.
+ * Or a return of > 0 indicates that driver request is exceeding the number
+ * of irqs available. Driver should use the returned value to re-send
+ * its request.
+ **/
+static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc)
+{
+    int status, pos, nr_entries;
+    struct pci_dev *pdev;
+    u16 control;
+    u8 slot = PCI_SLOT(msi->devfn);
+    u8 func = PCI_FUNC(msi->devfn);
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    pdev = pci_get_pdev(msi->bus, msi->devfn);
+    if ( !pdev )
+        return -ENODEV;
+
+    pos = pci_find_cap_offset(msi->bus, slot, func, PCI_CAP_ID_MSIX);
+    control = pci_conf_read16(msi->bus, slot, func, msi_control_reg(pos));
+    nr_entries = multi_msix_capable(control);
+    if (msi->entry_nr >= nr_entries)
+        return -EINVAL;
+
+    if ( find_msi_entry(pdev, msi->vector, PCI_CAP_ID_MSIX) )
+    {
+        dprintk(XENLOG_WARNING, "vector %d has already mapped to MSIX on "
+                "device %02x:%02x.%01x.\n", msi->vector, msi->bus,
+                PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+        return 0;
+    }
+
+    status = msix_capability_init(pdev, msi, desc);
+    return status;
+}
+
+static void __pci_disable_msix(struct msi_desc *entry)
+{
+    struct pci_dev *dev;
+    int pos;
+    u16 control;
+    u8 bus, slot, func;
+
+    dev = entry->dev;
+    bus = dev->bus;
+    slot = PCI_SLOT(dev->devfn);
+    func = PCI_FUNC(dev->devfn);
+
+    pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSIX);
+    control = pci_conf_read16(bus, slot, func, msix_control_reg(pos));
+    msi_set_enable(dev, 0);
+
+    BUG_ON(list_empty(&dev->msi_list));
+
+    writel(1, entry->mask_base + entry->msi_attrib.entry_nr
+      * PCI_MSIX_ENTRY_SIZE
+      + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+
+    pci_conf_write16(bus, slot, func, msix_control_reg(pos), control);
+}
+
+/*
+ * Notice: only construct the msi_desc
+ * no change to irq_desc here, and the interrupt is masked
+ */
+int pci_enable_msi(struct msi_info *msi, struct msi_desc **desc)
+{
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
+    return  msi->table_base ? __pci_enable_msix(msi, desc) :
+        __pci_enable_msi(msi, desc);
+}
+
+/*
+ * Device only, no irq_desc
+ */
+void pci_disable_msi(struct msi_desc *msi_desc)
+{
+    if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI )
+        __pci_disable_msi(msi_desc);
+    else if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSIX )
+        __pci_disable_msix(msi_desc);
+}
+
+static void msi_free_vectors(struct pci_dev* dev)
+{
+    struct msi_desc *entry, *tmp;
+    irq_desc_t *desc;
+    unsigned long flags, vector;
+
+    list_for_each_entry_safe( entry, tmp, &dev->msi_list, list )
+    {
+        vector = entry->vector;
+        desc = &irq_desc[vector];
+        pci_disable_msi(entry);
+
+        spin_lock_irqsave(&desc->lock, flags);
+
+        teardown_msi_vector(vector);
+
+        if ( desc->handler == &pci_msi_type )
+        {
+            /* MSI is not shared, so should be released already */
+            BUG_ON(desc->status & IRQ_GUEST);
+            desc->handler = &no_irq_type;
+        }
+
+        spin_unlock_irqrestore(&desc->lock, flags);
+        msi_free_vector(entry);
+    }
+}
+
+void pci_cleanup_msi(struct pci_dev *pdev)
+{
+    /* Disable MSI and/or MSI-X */
+    msi_set_enable(pdev, 0);
+    msix_set_enable(pdev, 0);
+    msi_free_vectors(pdev);
+}
+
+int pci_restore_msi_state(struct pci_dev *pdev)
+{
+    unsigned long flags;
+    int vector;
+    struct msi_desc *entry, *tmp;
+    irq_desc_t *desc;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
+    if (!pdev)
+        return -EINVAL;
+
+    list_for_each_entry_safe( entry, tmp, &pdev->msi_list, list )
+    {
+        vector = entry->vector;
+        desc = &irq_desc[vector];
+
+        spin_lock_irqsave(&desc->lock, flags);
+
+        ASSERT(desc->msi_desc == entry);
+
+        if (desc->msi_desc != entry)
+        {
+            dprintk(XENLOG_ERR, "Restore MSI for dev %x:%x not set before?\n",
+                                pdev->bus, pdev->devfn);
+            spin_unlock_irqrestore(&desc->lock, flags);
+            return -EINVAL;
+        }
+
+        msi_set_enable(pdev, 0);
+        write_msi_msg(entry, &entry->msg);
+
+        msi_set_enable(pdev, 1);
+        msi_set_mask_bit(vector, entry->msi_attrib.masked);
+        spin_unlock_irqrestore(&desc->lock, flags);
+    }
+
+    return 0;
+}
+
diff --git a/arch/x86/physdev.c b/arch/x86/physdev.c
index acbd214..a62cfdf 100644
--- a/arch/x86/physdev.c
+++ b/arch/x86/physdev.c
@@ -189,8 +189,15 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg)
         if ( (irq < 0) || (irq >= NR_IRQS) )
             break;
         irq_status_query.flags = 0;
-        if ( pirq_acktype(v->domain, irq) != 0 )
-            irq_status_query.flags |= XENIRQSTAT_needs_eoi;
+        /*
+         * Even edge-triggered or message-based IRQs can need masking from
+         * time to time. If the guest is not dynamically checking for this
+         * via the new pirq_eoi_map mechanism, it must conservatively always
+         * execute the EOI hypercall. In practice, this only really makes a
+         * difference for maskable MSI sources, and if those are supported
+         * then dom0 is probably modern anyway.
+         */
+        irq_status_query.flags |= XENIRQSTAT_needs_eoi;
         if ( pirq_shared(v->domain, irq) )
             irq_status_query.flags |= XENIRQSTAT_shared;
         ret = copy_to_guest(arg, &irq_status_query, 1) ? -EFAULT : 0;
diff --git a/drivers/passthrough/pci.c b/drivers/passthrough/pci.c
index 86173e3..c52b7e2 100644
--- a/drivers/passthrough/pci.c
+++ b/drivers/passthrough/pci.c
@@ -46,6 +46,7 @@ struct pci_dev *alloc_pdev(u8 bus, u8 devfn)
     *((u8*) &pdev->bus) = bus;
     *((u8*) &pdev->devfn) = devfn;
     pdev->domain = NULL;
+    INIT_LIST_HEAD(&pdev->msi_list);
     list_add(&pdev->alldevs_list, &alldevs_list);
 
     return pdev;
diff --git a/include/asm-x86/fixmap.h b/include/asm-x86/fixmap.h
index b6408da..c14d8f7 100644
--- a/include/asm-x86/fixmap.h
+++ b/include/asm-x86/fixmap.h
@@ -18,6 +18,7 @@
 #include <asm/page.h>
 #include <xen/kexec.h>
 #include <xen/iommu.h>
+#include <asm/msi.h>
 
 /*
  * Here we define all the compile-time 'special' virtual
@@ -43,6 +44,8 @@ enum fixed_addresses {
       + ((KEXEC_XEN_NO_PAGES >> 1) * KEXEC_IMAGE_NR) - 1,
     FIX_IOMMU_REGS_BASE_0,
     FIX_IOMMU_REGS_END = FIX_IOMMU_REGS_BASE_0 + MAX_IOMMUS-1,
+    FIX_MSIX_IO_RESERV_BASE,
+    FIX_MSIX_IO_RESERV_END = FIX_MSIX_IO_RESERV_BASE + MAX_MSIX_PAGES -1,
     __end_of_fixed_addresses
 };
 
diff --git a/include/asm-x86/irq.h b/include/asm-x86/irq.h
index 2c4f84d..ac90bbc 100644
--- a/include/asm-x86/irq.h
+++ b/include/asm-x86/irq.h
@@ -20,6 +20,8 @@
 extern int vector_irq[NR_VECTORS];
 extern u8 irq_vector[NR_IRQ_VECTORS];
 #define AUTO_ASSIGN             -1
+#define NEVER_ASSIGN            -2
+#define FREE_TO_ASSIGN          -3
 
 #define platform_legacy_irq(irq)	((irq) < 16)
 
diff --git a/include/asm-x86/msi.h b/include/asm-x86/msi.h
new file mode 100644
index 0000000..1f18b7c
--- /dev/null
+++ b/include/asm-x86/msi.h
@@ -0,0 +1,225 @@
+#ifndef __ASM_MSI_H
+#define __ASM_MSI_H
+
+#include <xen/cpumask.h>
+#include <asm/irq.h>
+/*
+ * Constants for Intel APIC based MSI messages.
+ */
+
+/*
+ * Shifts for MSI data
+ */
+
+#define MSI_DATA_VECTOR_SHIFT		0
+#define  MSI_DATA_VECTOR_MASK		0x000000ff
+#define	 MSI_DATA_VECTOR(v)		(((v) << MSI_DATA_VECTOR_SHIFT) & MSI_DATA_VECTOR_MASK)
+
+#define MSI_DATA_DELIVERY_MODE_SHIFT	8
+#define  MSI_DATA_DELIVERY_FIXED	(0 << MSI_DATA_DELIVERY_MODE_SHIFT)
+#define  MSI_DATA_DELIVERY_LOWPRI	(1 << MSI_DATA_DELIVERY_MODE_SHIFT)
+
+#define MSI_DATA_LEVEL_SHIFT		14
+#define	 MSI_DATA_LEVEL_DEASSERT	(0 << MSI_DATA_LEVEL_SHIFT)
+#define	 MSI_DATA_LEVEL_ASSERT		(1 << MSI_DATA_LEVEL_SHIFT)
+
+#define MSI_DATA_TRIGGER_SHIFT		15
+#define  MSI_DATA_TRIGGER_EDGE		(0 << MSI_DATA_TRIGGER_SHIFT)
+#define  MSI_DATA_TRIGGER_LEVEL		(1 << MSI_DATA_TRIGGER_SHIFT)
+
+/*
+ * Shift/mask fields for msi address
+ */
+
+#define MSI_ADDR_BASE_HI	    	0
+#define MSI_ADDR_BASE_LO	    	0xfee00000
+#define MSI_ADDR_HEADER             MSI_ADDR_BASE_LO
+
+#define MSI_ADDR_DESTMODE_SHIFT     2
+#define MSI_ADDR_DESTMODE_PHYS      (0 << MSI_ADDR_DESTMODE_SHIFT)
+#define MSI_ADDR_DESTMODE_LOGIC     (1 << MSI_ADDR_DESTMODE_SHIFT)
+
+#define MSI_ADDR_REDIRECTION_SHIFT  3
+#define MSI_ADDR_REDIRECTION_CPU    (0 << MSI_ADDR_REDIRECTION_SHIFT)
+#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT)
+
+#define MSI_ADDR_DEST_ID_SHIFT		12
+#define	 MSI_ADDR_DEST_ID_MASK		0x00ffff0
+#define  MSI_ADDR_DEST_ID(dest)		(((dest) << MSI_ADDR_DEST_ID_SHIFT) & MSI_ADDR_DEST_ID_MASK)
+
+/* MAX fixed pages reserved for mapping MSIX tables. */
+#if defined(__x86_64__)
+#define MAX_MSIX_PAGES              512
+#else
+#define MAX_MSIX_PAGES              32
+#endif
+
+struct msi_info {
+    int bus;
+    int devfn;
+    int vector;
+    int entry_nr;
+    uint64_t table_base;
+};
+
+struct msi_msg {
+	u32	address_lo;	/* low 32 bits of msi message address */
+	u32	address_hi;	/* high 32 bits of msi message address */
+	u32	data;		/* 16 bits of msi message data */
+};
+
+struct msi_desc;
+/* Helper functions */
+extern void mask_msi_vector(unsigned int vector);
+extern void unmask_msi_vector(unsigned int vector);
+extern void set_msi_affinity(unsigned int vector, cpumask_t mask);
+extern int pci_enable_msi(struct msi_info *msi, struct msi_desc **desc);
+extern void pci_disable_msi(struct msi_desc *desc);
+extern void pci_cleanup_msi(struct pci_dev *pdev);
+extern int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc);
+extern void teardown_msi_vector(int vector);
+extern int msi_free_vector(struct msi_desc *entry);
+extern int pci_restore_msi_state(struct pci_dev *pdev);
+
+struct msi_desc {
+	struct {
+		__u8	type	: 5; 	/* {0: unused, 5h:MSI, 11h:MSI-X} */
+		__u8	maskbit	: 1; 	/* mask-pending bit supported ?   */
+		__u8	masked	: 1;
+		__u8	is_64	: 1;	/* Address size: 0=32bit 1=64bit  */
+		__u8	pos;	 	/* Location of the msi capability */
+		__u16	entry_nr;    	/* specific enabled entry 	  */
+	}msi_attrib;
+
+	struct list_head list;
+
+	void __iomem *mask_base;
+	struct pci_dev *dev;
+	int vector;
+
+	struct msi_msg msg;		/* Last set MSI message */
+
+	int remap_index;		/* index in interrupt remapping table */
+};
+
+int msi_maskable_irq(const struct msi_desc *);
+
+/*
+ * Assume the maximum number of hot plug slots supported by the system is about
+ * ten. The worst case is that each of these slots is hot-added with a device,
+ * which has two MSI/MSI-X capable functions. To avoid any MSI-X driver, which
+ * attempts to request all available vectors, NR_HP_RESERVED_VECTORS is defined
+ * as below to ensure at least one message is assigned to each detected MSI/
+ * MSI-X device function.
+ */
+#define NR_HP_RESERVED_VECTORS 	20
+
+extern struct hw_interrupt_type pci_msi_type;
+
+/*
+ * MSI-X Address Register
+ */
+#define PCI_MSIX_FLAGS_QSIZE		0x7FF
+#define PCI_MSIX_FLAGS_ENABLE		(1 << 15)
+#define PCI_MSIX_FLAGS_BIRMASK		(7 << 0)
+#define PCI_MSIX_FLAGS_BITMASK		(1 << 0)
+
+#define PCI_MSIX_ENTRY_SIZE			16
+#define  PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET	0
+#define  PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET	4
+#define  PCI_MSIX_ENTRY_DATA_OFFSET		8
+#define  PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET	12
+
+#define msi_control_reg(base)		(base + PCI_MSI_FLAGS)
+#define msi_lower_address_reg(base)	(base + PCI_MSI_ADDRESS_LO)
+#define msi_upper_address_reg(base)	(base + PCI_MSI_ADDRESS_HI)
+#define msi_data_reg(base, is64bit)	\
+	( (is64bit == 1) ? base+PCI_MSI_DATA_64 : base+PCI_MSI_DATA_32 )
+#define msi_mask_bits_reg(base, is64bit) \
+	( (is64bit == 1) ? base+PCI_MSI_MASK_BIT : base+PCI_MSI_MASK_BIT-4)
+#define msi_disable(control)		control &= ~PCI_MSI_FLAGS_ENABLE
+#define multi_msi_capable(control) \
+	(1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
+#define multi_msi_enable(control, num) \
+	control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE);
+#define is_64bit_address(control)	(!!(control & PCI_MSI_FLAGS_64BIT))
+#define is_mask_bit_support(control)	(!!(control & PCI_MSI_FLAGS_MASKBIT))
+#define msi_enable(control, num) multi_msi_enable(control, num); \
+	control |= PCI_MSI_FLAGS_ENABLE
+
+#define msix_control_reg(base)		(base + PCI_MSIX_FLAGS)
+#define msix_table_offset_reg(base)	(base + 0x04)
+#define msix_pba_offset_reg(base)	(base + 0x08)
+#define msix_enable(control)	 	control |= PCI_MSIX_FLAGS_ENABLE
+#define msix_disable(control)	 	control &= ~PCI_MSIX_FLAGS_ENABLE
+#define msix_table_size(control) 	((control & PCI_MSIX_FLAGS_QSIZE)+1)
+#define multi_msix_capable		msix_table_size
+#define msix_unmask(address)	 	(address & ~PCI_MSIX_FLAGS_BITMASK)
+#define msix_mask(address)		(address | PCI_MSIX_FLAGS_BITMASK)
+#define msix_is_pending(address) 	(address & PCI_MSIX_FLAGS_PENDMASK)
+
+/*
+ * MSI Defined Data Structures
+ */
+#define MSI_ADDRESS_HEADER		0xfee
+#define MSI_ADDRESS_HEADER_SHIFT	12
+#define MSI_ADDRESS_HEADER_MASK		0xfff000
+#define MSI_ADDRESS_DEST_ID_MASK	0xfff0000f
+#define MSI_TARGET_CPU_MASK		0xff
+#define MSI_TARGET_CPU_SHIFT		12
+#define MSI_DELIVERY_MODE		0
+#define MSI_LEVEL_MODE			1	/* Edge always assert */
+#define MSI_TRIGGER_MODE		0	/* MSI is edge sensitive */
+#define MSI_PHYSICAL_MODE		0
+#define MSI_LOGICAL_MODE		1
+#define MSI_REDIRECTION_HINT_MODE	0
+
+#define __LITTLE_ENDIAN_BITFIELD	1
+
+struct msg_data {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u32	vector		:  8;
+	__u32	delivery_mode	:  3;	/* 000b: FIXED | 001b: lowest prior */
+	__u32	reserved_1	:  3;
+	__u32	level		:  1;	/* 0: deassert | 1: assert */
+	__u32	trigger		:  1;	/* 0: edge | 1: level */
+	__u32	reserved_2	: 16;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u32	reserved_2	: 16;
+	__u32	trigger		:  1;	/* 0: edge | 1: level */
+	__u32	level		:  1;	/* 0: deassert | 1: assert */
+	__u32	reserved_1	:  3;
+	__u32	delivery_mode	:  3;	/* 000b: FIXED | 001b: lowest prior */
+	__u32	vector		:  8;
+#else
+#error "Bitfield endianness not defined! Check your byteorder.h"
+#endif
+} __attribute__ ((packed));
+
+struct msg_address {
+	union {
+		struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+			__u32	reserved_1	:  2;
+			__u32	dest_mode	:  1;	/*0:physic | 1:logic */
+			__u32	redirection_hint:  1;  	/*0: dedicated CPU
+							  1: lowest priority */
+			__u32	reserved_2	:  4;
+ 			__u32	dest_id		: 24;	/* Destination ID */
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ 			__u32	dest_id		: 24;	/* Destination ID */
+			__u32	reserved_2	:  4;
+			__u32	redirection_hint:  1;  	/*0: dedicated CPU
+							  1: lowest priority */
+			__u32	dest_mode	:  1;	/*0:physic | 1:logic */
+			__u32	reserved_1	:  2;
+#else
+#error "Bitfield endianness not defined! Check your byteorder.h"
+#endif
+      		}u;
+       		__u32  value;
+	}lo_address;
+	__u32 	hi_address;
+} __attribute__ ((packed));
+
+#endif /* __ASM_MSI_H */
diff --git a/include/xen/irq.h b/include/xen/irq.h
index 71c4484..81677e0 100644
--- a/include/xen/irq.h
+++ b/include/xen/irq.h
@@ -44,6 +44,7 @@ typedef struct hw_interrupt_type hw_irq_controller;
 
 #include <asm/irq.h>
 
+struct msi_desc;
 /*
  * This is the "IRQ descriptor", which contains various information
  * about the irq, including what kind of hardware handling it has,
@@ -54,9 +55,11 @@ typedef struct hw_interrupt_type hw_irq_controller;
 typedef struct {
     unsigned int status;		/* IRQ status */
     hw_irq_controller *handler;
+    struct msi_desc   *msi_desc;
     struct irqaction *action;	/* IRQ action list */
     unsigned int depth;		/* nested irq disables */
     spinlock_t lock;
+    cpumask_t affinity;
 } __cacheline_aligned irq_desc_t;
 
 extern irq_desc_t irq_desc[NR_IRQS];
@@ -80,4 +83,16 @@ extern void pirq_guest_unbind(struct domain *d, int irq);
 extern irq_desc_t *domain_spin_lock_irq_desc(
 		    struct domain *d, int irq, unsigned long *pflags);
 
+static inline void set_native_irq_info(unsigned int vector, cpumask_t mask)
+{
+	    irq_desc[vector].affinity = mask;
+}
+
+#ifdef irq_to_vector
+static inline void set_irq_info(int irq, cpumask_t mask)
+{
+	    set_native_irq_info(irq_to_vector(irq), mask);
+}
+#endif
+
 #endif /* __XEN_IRQ_H__ */
diff --git a/include/xen/pci.h b/include/xen/pci.h
index 618c6bf..e7c98c5 100644
--- a/include/xen/pci.h
+++ b/include/xen/pci.h
@@ -32,6 +32,7 @@
 struct pci_dev {
     struct list_head alldevs_list;
     struct list_head domain_list;
+    struct list_head msi_list;
     struct domain *domain;
     const u8 bus;
     const u8 devfn;