From: Justin M. Forbes <jforbes@redhat.com>
Date: Thu, 7 May 2009 16:36:54 -0500
Subject: [xen] add Credit Scheduler Fairness and hard virt
Message-id: 20090507213654.GB22541@redhat.com
O-Subject: Re: [RHEL5.4 PATCH] Add Credit Scheduler Fairness and hard virt to xen hypervisor
Bugzilla: 432700
RH-Acked-by: Chris Lalancette <clalance@redhat.com>
RH-Acked-by: Rik van Riel <riel@redhat.com>
RH-Acked-by: Don Dutile <ddutile@redhat.com>

BZ#432700
https://bugzilla.redhat.com/show_bug.cgi?id=432700

Description:
Add credit scheduler fairness to allocate resources more fairly across
domains.  With the "hardvirt" hypervisor option we can also give dom0 a
priority boost, allowing for lower I/O latency.  By default the dom0
priority boost is turned off because it could impact customers who choose
to run real workloads on their dom0, but the option is critical for Amazon.
This allows hard-virt support without major impact on existing deployments.
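
For reference, enabling the boost only requires the "hardvirt" option on the
hypervisor line of the boot loader configuration; a minimal sketch, with
placeholder paths and kernel versions:

    title Xen (hard-virt enabled)
        root (hd0,0)
        kernel /xen.gz hardvirt
        module /vmlinuz-2.6.18-el5xen ro root=/dev/VolGroup00/LogVol00
        module /initrd-2.6.18-el5xen.img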

Upstream Status:
This patch is currently being forward ported for upstream submission, but
is critical to our customer for RHEL 5.4.

Testing:
Extensive testing has been done with the hardvirt option disabled (the
default case) and with it enabled, to ensure that the hard-virt and dom0
priority boost do not impact existing deployments.  A follow-up will
include more history and details on the benchmarking.
--
Justin M. Forbes

[-- Attachment #2: xen-hardvirt-csched-fairness.patch --]
[-- Type: text/plain, Encoding: 7bit, Size: 44K --]

commit 1a4c015521491b393b8e826178ab19bbcca8f4b3
Author: Justin M. Forbes <jforbes@redhat.com>
Date:   Thu Apr 23 00:02:03 2009 -0500

    Introduce hard virt and scheduler fairness. By default this makes
    the credit scheduler more fair in scheduling all domains.  Passing the
    hypervisor boot option "hardvirt" will enable dom0 to get a priority
    boost and allow additional vcpus to be given a hard virt priority boost.

    Signed-off-by: Justin M. Forbes <jforbes@redhat.com>

Attached is the updated patch.  To keep the changes minimal and manageable,
it simply removes most of the excessive comments, cleans up a messy
declaration, removes the unlikely() hints that were not actually unlikely,
and moves the check for opt_hardvirt to the beginning of the chains where we
check for it.  A performance email will follow.
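
As a simplified illustration of that reordering, the dom0 checks now test
the cheap opt_hardvirt flag first so the whole chain short-circuits on hosts
booted without the option, along the lines of the csched_schedule() hunk
below:

    /* Sketch only: global flag tested first, so non-hardvirt hosts
     * never evaluate the rest of the condition chain. */
    if ( opt_hardvirt && current->domain->domain_id == 0 && vcpu_runnable(current) )
    {
        /* dom0 fast path */
    }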

--
Justin M. Forbes

commit 178aa9a965faa1f85959b1841b57208913c4083a
Author: Justin M. Forbes <jforbes@redhat.com>
Date:   Thu May 7 16:32:40 2009 -0500

    Introduce hard virt and scheduler fairness. By default this makes
    the credit scheduler more fair in scheduling all domains.  Passing the
    hypervisor boot option "hardvirt" will enable dom0 to get a priority
    boost and allow additional vcpus to be given a hard virt priority boost.

    Signed-off-by: Justin M. Forbes <jforbes@redhat.com>
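
One note on the control path before the diff: rather than adding a new
domctl, the patch overloads bit 15 of the credit scheduler weight as the
hard-virt flag (see the csched_dom_cntl() hunk).  A minimal sketch of how a
management tool might encode that request; the helper name is hypothetical:

    #include <stdint.h>

    #define CSCHED_WEIGHT_HARD_VIRT  (1u << 15)  /* bit 15 flags hard-virt */

    /* Hypothetical helper: pack an ordinary weight (0-32767) and the
     * hard-virt flag into the 16-bit weight field passed down via
     * XEN_DOMCTL_SCHEDOP_putinfo. */
    static uint16_t encode_credit_weight(uint16_t weight, int hard_virt)
    {
        return (uint16_t)((weight & 0x7fff) |
                          (hard_virt ? CSCHED_WEIGHT_HARD_VIRT : 0));
    }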

diff --git a/common/domain.c b/common/domain.c
index 33e464b..9fa0b83 100644
--- a/common/domain.c
+++ b/common/domain.c
@@ -81,6 +81,8 @@ struct domain *alloc_domain(domid_t domid)
     spin_lock_init(&d->shutdown_lock);
     INIT_LIST_HEAD(&d->page_list);
     INIT_LIST_HEAD(&d->xenpage_list);
+    /* HV */
+    atomic_set(&d->hard_virt, 0);
 
     return d;
 }
diff --git a/common/sched_credit.c b/common/sched_credit.c
index 192ed84..1224abd 100644
--- a/common/sched_credit.c
+++ b/common/sched_credit.c
@@ -6,7 +6,8 @@
  *      Author: Emmanuel Ackaouy
  *
  * Description: Credit-based SMP CPU scheduler
- */
+ *
+*/
 
 #include <xen/config.h>
 #include <xen/init.h>
@@ -48,14 +49,24 @@
 #define CSCHED_CREDITS_PER_ACCT     \
     (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_ACCT)
 
+/* opt_hardvirt: This enables both the dom0 bypass and
+ * hard virt dom0.  By default these are disabled so as to
+ * keep behavior as expected for workloads running on an
+ * existing dom0.
+ */
+static int opt_hardvirt = 0;
+boolean_param("hardvirt", opt_hardvirt);
+
 
 /*
  * Priorities
  */
-#define CSCHED_PRI_TS_BOOST      0      /* time-share waking up */
 #define CSCHED_PRI_TS_UNDER     -1      /* time-share w/ credits */
 #define CSCHED_PRI_TS_OVER      -2      /* time-share w/o credits */
 #define CSCHED_PRI_IDLE         -64     /* idle */
+#define CSCHED_PRI_RR           10      /* Dom-0 and Hard-Virts - HV*/
+
+#define NUMBER_DOM0_VCPUS_PRESENT(_cpu) (CSCHED_PCPU(_cpu)->number_of_dom0_vcpus_present)
 
 
 /*
@@ -123,7 +134,12 @@
     _MACRO(dom_init)                        \
     _MACRO(dom_destroy)                     \
     _MACRO(vcpu_init)                       \
-    _MACRO(vcpu_destroy)
+    _MACRO(vcpu_destroy)                    \
+    _MACRO(tickle_hard_virt_none)                       \
+    _MACRO(rt_imbalance)                                \
+    _MACRO(rt_vcpu_migrate)                             \
+    _MACRO(rt_steal_trylock_failed)
+
 
 #ifndef NDEBUG
 #define CSCHED_STATS_EXPAND_CHECKS(_MACRO)  \
@@ -188,6 +204,8 @@ struct csched_pcpu {
     uint32_t runq_sort_last;
     struct timer ticker;
     unsigned int tick;
+    uint16_t number_of_dom0_vcpus_present;
+    uint16_t unused; /* HV */
 };
 
 /*
@@ -201,6 +219,11 @@ struct csched_vcpu {
     atomic_t credit;
     uint16_t flags;
     int16_t pri;
+    int credit_real_incr;
+    atomic_t hard_virt_pcpu;  /* HV */
+    uint16_t hard_virt_pcpu_state_change; /* HV */
+    uint16_t unused;
+
 #ifdef CSCHED_STATS
     struct {
         int credit_last;
@@ -239,6 +262,9 @@ struct csched_private {
     int credit_balance;
     uint32_t runq_sort;
     CSCHED_STATS_DEFINE()
+    spinlock_t hard_virt_lock; /* HV */
+    cpumask_t hard_virt_none; /* 1 by default - meaning it has no RT vcpu */
+    cpumask_t hard_virt_multiple; /* 0 by default - meaning it has no more than 1 RT vcpu */
 };
 
 
@@ -249,6 +275,9 @@ static struct csched_private csched_priv;
 
 static void csched_tick(void *_cpu);
 
+/* HV - Protected by hard_virt_lock */
+static unsigned int total_hard_virts=0;
+
 static inline int
 __cycle_cpu(int cpu, const cpumask_t *mask)
 {
@@ -275,18 +304,100 @@ __runq_insert(unsigned int cpu, struct csched_vcpu *svc)
 {
     const struct list_head * const runq = RUNQ(cpu);
     struct list_head *iter;
+    int credit, new_credit;
+
 
     BUG_ON( __vcpu_on_runq(svc) );
     BUG_ON( cpu != svc->vcpu->processor );
 
+    /* HV - No race condition for hard_virt_pcpu_state_change here */
+    if (svc->hard_virt_pcpu_state_change)
+    {
+       svc->hard_virt_pcpu_state_change = 0;
+       if (atomic_read(&svc->hard_virt_pcpu))
+          svc->pri = CSCHED_PRI_RR;
+       else if (svc->pri == CSCHED_PRI_RR && svc->vcpu->domain->domain_id != 0)
+       {
+           if (atomic_read(&svc->credit) > 0)
+               svc->pri = CSCHED_PRI_TS_UNDER;
+           else
+               svc->pri = CSCHED_PRI_TS_OVER;
+       }
+    }
+    if (svc->vcpu->domain->domain_id == 0)
+       NUMBER_DOM0_VCPUS_PRESENT(cpu)++;
+
+    new_credit = atomic_read(&svc->credit);
+
+    if (new_credit >= CSCHED_CREDITS_PER_TSLICE/2)
+    {
+        list_for_each( iter, runq )
+        {
+            const struct csched_vcpu * const iter_svc = __runq_elem(iter);
+            if (svc->pri > iter_svc->pri )
+                break;
+            credit = atomic_read(&iter_svc->credit);
+            if ( svc->pri == iter_svc->pri && credit < (CSCHED_CREDITS_PER_TSLICE/2) )
+                break;
+        }
+
+    }
+    else
+    {
+         list_for_each( iter, runq )
+        {
+            const struct csched_vcpu * const iter_svc = __runq_elem(iter);
+            if ( svc->pri > iter_svc->pri )
+                break;
+        }
+    }
+
+    list_add_tail(&svc->runq_elem, iter);
+}
+
+static inline void
+__runq_insert_special(unsigned int cpu, struct csched_vcpu *svc)
+{
+    const struct list_head * const runq = RUNQ(cpu);
+    struct list_head *iter;
+    int new_credit, credit;
+
+    BUG_ON( __vcpu_on_runq(svc) );
+    BUG_ON( cpu != svc->vcpu->processor );
+
+    /* HV */
+    if (svc->hard_virt_pcpu_state_change)
+    {
+       svc->hard_virt_pcpu_state_change = 0;
+       if (atomic_read(&svc->hard_virt_pcpu))
+          svc->pri = CSCHED_PRI_RR;
+       else if (svc->pri == CSCHED_PRI_RR && svc->vcpu->domain->domain_id != 0)
+       {
+           if (atomic_read(&svc->credit) > 0)
+               svc->pri = CSCHED_PRI_TS_UNDER;
+           else
+               svc->pri = CSCHED_PRI_TS_OVER;
+       }
+    }
+    if (svc->vcpu->domain->domain_id == 0)
+       NUMBER_DOM0_VCPUS_PRESENT(cpu)++;
+
+
+    new_credit = atomic_read(&svc->credit);
+
+
     list_for_each( iter, runq )
     {
         const struct csched_vcpu * const iter_svc = __runq_elem(iter);
-        if ( svc->pri > iter_svc->pri )
+        if ( svc->pri > iter_svc->pri)
+           break;
+        credit = atomic_read(&iter_svc->credit);
+        if ( (svc->pri == iter_svc->pri && new_credit >= credit))
             break;
     }
-
+
     list_add_tail(&svc->runq_elem, iter);
+
 }
 
 static inline void
@@ -294,6 +405,25 @@ __runq_remove(struct csched_vcpu *svc)
 {
     BUG_ON( !__vcpu_on_runq(svc) );
     list_del_init(&svc->runq_elem);
+
+    /* HV */
+    if (svc->vcpu->domain->domain_id == 0)
+        NUMBER_DOM0_VCPUS_PRESENT(svc->vcpu->processor)--;
+
+    if (svc->hard_virt_pcpu_state_change)
+    {
+       svc->hard_virt_pcpu_state_change = 0;
+       if (atomic_read(&svc->hard_virt_pcpu))
+          svc->pri = CSCHED_PRI_RR;
+       else if (svc->pri == CSCHED_PRI_RR && svc->vcpu->domain->domain_id != 0 )
+       {
+           if (atomic_read(&svc->credit) > 0)
+               svc->pri = CSCHED_PRI_TS_UNDER;
+           else
+               svc->pri = CSCHED_PRI_TS_OVER;
+       }
+    }
+
 }
 
 static inline void
@@ -302,12 +432,17 @@ __runq_tickle(unsigned int cpu, struct csched_vcpu *new)
     struct csched_vcpu * const cur =
         CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
     cpumask_t mask;
+    int newcredit, curcredit;
 
     ASSERT(cur);
     cpus_clear(mask);
 
     /* If strictly higher priority than current VCPU, signal the CPU */
-    if ( new->pri > cur->pri )
+    newcredit = atomic_read(&new->credit);
+    curcredit = atomic_read(&cur->credit);
+    /* HV   */
+    if ((opt_hardvirt && new->vcpu->domain->domain_id == 0) || (new->pri > cur->pri ) ||
+          (new->pri == cur->pri && newcredit > curcredit && newcredit > -(CSCHED_CREDITS_PER_TSLICE>>3)) )
     {
         if ( cur->pri == CSCHED_PRI_IDLE )
             CSCHED_STAT_CRANK(tickle_local_idler);
@@ -339,6 +474,18 @@ __runq_tickle(unsigned int cpu, struct csched_vcpu *new)
         }
     }
 
+    /* HV - Small chance of false positive in hard_virt_none map here */
+    if ( cur->pri == CSCHED_PRI_RR && new->pri == CSCHED_PRI_RR )
+    {
+       cpu_set(cpu, csched_priv.hard_virt_multiple);
+       if ( ! cpus_empty(csched_priv.hard_virt_none) )
+       {
+           CSCHED_STAT_CRANK(tickle_hard_virt_none);
+           cpus_or(mask, mask, csched_priv.hard_virt_none);
+           cpus_and(mask, mask, new->vcpu->cpu_affinity);
+       }
+    }
+
     /* Send scheduler interrupts to designated CPUs */
     if ( !cpus_empty(mask) )
         cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
@@ -367,11 +514,14 @@ csched_pcpu_init(int cpu)
     init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu);
     INIT_LIST_HEAD(&spc->runq);
     spc->runq_sort_last = csched_priv.runq_sort;
+    spc->number_of_dom0_vcpus_present = 0;
+    spc->unused = 0; /* HV */
     per_cpu(schedule_data, cpu).sched_priv = spc;
 
     /* Start off idling... */
     BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
     cpu_set(cpu, csched_priv.idlers);
+    cpu_set(cpu, csched_priv.hard_virt_none); /* HV */
 
     spin_unlock_irqrestore(&csched_priv.lock, flags);
 
@@ -464,6 +614,20 @@ csched_cpu_pick(struct vcpu *vc)
         }
         else
         {
+            /* Hmm.. This is of questionable value.. 
+             * There are many cases where Vcpus are better off
+             * being on the same socket due to effective L2 sharing
+             * and low impact of cache bouncing. 
+             * In the absence of any other workload, moving the Vcpus
+             * to different cores will be useful transiently but when
+             * the system gets busy since there is no mechanism to assert
+             * socket level affinities, it will be a hit on the performance.
+             * NUMA smartness has also gone for a toss here.
+             * 
+             * Eventually we would want to allocate memory for Virts from 
+             * local NUMA nodes in which case NUMA affinities need to 
+             * local NUMA nodes in which case NUMA affinities need to be
+             * needs to be thrown out  */
             ASSERT( !cpu_isset(nxt, cpu_core_map[cpu]) );
             cpus_and(cpu_idlers, idlers, cpu_core_map[cpu]);
             cpus_and(nxt_idlers, idlers, cpu_core_map[nxt]);
@@ -533,21 +697,22 @@ csched_vcpu_acct(unsigned int cpu)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(current);
 
+    int credit;
+    /* Update credits */
+    credit = atomic_read(&svc->credit);
+
     ASSERT( current->processor == cpu );
     ASSERT( svc->sdom != NULL );
 
     /*
-     * If this VCPU's priority was boosted when it last awoke, reset it.
-     * If the VCPU is found here, then it's consuming a non-negligeable
-     * amount of CPU resources and should no longer be boosted.
-     */
-    if ( svc->pri == CSCHED_PRI_TS_BOOST )
-        svc->pri = CSCHED_PRI_TS_UNDER;
-
-    /*
      * Update credits
      */
     atomic_sub(CSCHED_CREDITS_PER_TICK, &svc->credit);
+
+    if ( credit < CSCHED_CREDITS_PER_TICK && svc->pri == CSCHED_PRI_TS_UNDER )
+    {
+         svc->pri = CSCHED_PRI_TS_OVER;
+    }
 
     /*
      * Put this VCPU and domain back on the active list if it was
@@ -594,6 +759,14 @@ csched_vcpu_init(struct vcpu *vc)
     CSCHED_VCPU_STATS_RESET(svc);
     vc->sched_priv = svc;
 
+    /* HV */
+    if (opt_hardvirt && vc->domain->domain_id == 0 && !is_idle_vcpu(vc))
+	svc->pri = CSCHED_PRI_RR;        
+    svc->credit_real_incr = 0;
+    atomic_set(&svc->hard_virt_pcpu, 0); /* HV */
+    svc->hard_virt_pcpu_state_change = 0;
+    svc->unused = 0;
+
     /* Allocate per-PCPU info */
     if ( unlikely(!CSCHED_PCPU(vc->processor)) )
     {
@@ -617,6 +790,16 @@ csched_vcpu_destroy(struct vcpu *vc)
     BUG_ON( sdom == NULL );
     BUG_ON( !list_empty(&svc->runq_elem) );
 
+    /* HV */
+    spin_lock(&csched_priv.hard_virt_lock);
+    if (atomic_read(&svc->hard_virt_pcpu))
+    {
+         atomic_set(&svc->hard_virt_pcpu, 0);
+         svc->hard_virt_pcpu_state_change=1;
+         total_hard_virts--;
+    }
+    spin_unlock(&csched_priv.hard_virt_lock);
+
     spin_lock_irqsave(&csched_priv.lock, flags);
 
     if ( !list_empty(&svc->active_vcpu_elem) )
@@ -666,37 +849,32 @@ csched_vcpu_wake(struct vcpu *vc)
     else
         CSCHED_STAT_CRANK(vcpu_wake_not_runnable);
 
-    /*
-     * We temporarly boost the priority of awaking VCPUs!
-     *
-     * If this VCPU consumes a non negligeable amount of CPU, it
-     * will eventually find itself in the credit accounting code
-     * path where its priority will be reset to normal.
-     *
-     * If on the other hand the VCPU consumes little CPU and is
-     * blocking and awoken a lot (doing I/O for example), its
-     * priority will remain boosted, optimizing it's wake-to-run
-     * latencies.
-     *
-     * This allows wake-to-run latency sensitive VCPUs to preempt
-     * more CPU resource intensive VCPUs without impacting overall 
-     * system fairness.
-     *
-     * The one exception is for VCPUs of capped domains unpausing
-     * after earning credits they had overspent. We don't boost
-     * those.
-     */
-    if ( svc->pri == CSCHED_PRI_TS_UNDER &&
-         !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
-    {
-        svc->pri = CSCHED_PRI_TS_BOOST;
-    }
-
     /* Put the VCPU on the runq and tickle CPUs */
-    __runq_insert(cpu, svc);
+    __runq_insert_special(cpu, svc);
     __runq_tickle(cpu, svc);
 }
 
+/* HV - Count up all vcpus including offline ones */
+static unsigned int find_vcpu_count(struct domain *d)
+{
+    struct vcpu *v;
+    unsigned int vcpu_count=0;
+    for_each_vcpu(d, v)
+        vcpu_count++;
+    return vcpu_count;
+}
+
+/* HV - Only online pcpus are considered as valid HV target */
+static unsigned int find_available_online_cpus(unsigned int max_cpus)
+{
+    int cpu;
+    unsigned int pcpu_count=0;
+
+    for_each_online_cpu ( cpu )
+       pcpu_count++;
+    return pcpu_count - total_hard_virts;
+}
+
 static int
 csched_dom_cntl(
     struct domain *d,
@@ -705,14 +883,98 @@ csched_dom_cntl(
     struct csched_dom * const sdom = CSCHED_DOM(d);
     unsigned long flags;
 
+    /* HV */
+    unsigned short hard_virt, vcpu;
+    unsigned int vcpus_in_domain, hard_cpus_available;
+
     if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
     {
-        op->u.credit.weight = sdom->weight;
+        /* HV */
+        op->u.credit.weight = sdom->weight + (atomic_read(&d->hard_virt) << 15) ;
         op->u.credit.cap = sdom->cap;
     }
     else
     {
         ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);
+
+        /* HV */
+        hard_virt = (op->u.credit.weight >> 15) & 0x1;
+        op->u.credit.weight &= 0x7fff;
+
+        printk("Weight assignment %u - w %u h_virt %u\n", d->domain_id,
+                   op->u.credit.weight, hard_virt);
+        if (hard_virt != atomic_read(&d->hard_virt))
+        {
+           if (!hard_virt)
+           {
+               /* This will convert a hard-virt to virt - This really shouldn't fail */
+               printk("Taking down hard-virt %u\n", d->domain_id);
+               spin_lock(&csched_priv.hard_virt_lock);
+               for (vcpu=0; vcpu < MAX_VIRT_CPUS; vcpu++)
+               {
+                  if (d->vcpu[vcpu] == NULL)
+                     break;
+                  if ( atomic_read( &(CSCHED_VCPU(d->vcpu[vcpu])->hard_virt_pcpu) ) )
+                  {
+                         atomic_set(&(CSCHED_VCPU(d->vcpu[vcpu])->hard_virt_pcpu), 0);
+                         CSCHED_VCPU(d->vcpu[vcpu])->hard_virt_pcpu_state_change = 1;
+                  }
+                  total_hard_virts--;
+               }
+               atomic_set(&d->hard_virt, 0);
+               spin_unlock(&csched_priv.hard_virt_lock);
+               if (total_hard_virts < 0){
+                  printk("total_hard_virts less than 0!!\n");
+                  total_hard_virts = 0;
+               }
+
+           }
+           else
+           {
+               /* This will convert the virt into a hard-virt - If this fails, the entire operation fails */
+
+               /* Hard Virt conversion is made atomic with respect to hardvirt destruction code path using a spinlock  */
+               printk("Creating Hard-Virt %u\n", d->domain_id);
+               if (sdom->cap != 0U)
+               {
+                   return -0xDEAD;
+               }
+               if (d->domain_id == 0)
+               {
+                   return -0xDEAD;
+               }
+
+               spin_lock(&csched_priv.hard_virt_lock);
+               vcpus_in_domain = find_vcpu_count(d);
+               hard_cpus_available = find_available_online_cpus(vcpus_in_domain);
+               printk("to convert %d - available %d \n", vcpus_in_domain, hard_cpus_available);
+               if (vcpus_in_domain > hard_cpus_available)
+               {
+                   spin_unlock(&csched_priv.hard_virt_lock);
+                   return -0xDEAD;
+               }
+               atomic_set(&d->hard_virt, 1);
+               for (vcpu=0; vcpu < MAX_VIRT_CPUS; vcpu++)
+               {
+                  if (d->vcpu[vcpu] == NULL)
+                     break;
+                  if ( atomic_read(&(CSCHED_VCPU(d->vcpu[vcpu])->hard_virt_pcpu)) )
+                  {
+                     spin_unlock(&csched_priv.hard_virt_lock);
+                     printk("Vcpu %d already has a pcpu assigned - Aborting half way through.. \n", vcpu);
+                     atomic_set(&d->hard_virt, 0);
+                     return -0xDEAD;
+                  }
+                  atomic_set(&(CSCHED_VCPU(d->vcpu[vcpu])->hard_virt_pcpu), 1);
+                  CSCHED_VCPU(d->vcpu[vcpu])->hard_virt_pcpu_state_change = 1;
+                  total_hard_virts++;
+               }
+               spin_unlock(&csched_priv.hard_virt_lock);
+
+
+           }
+
+        }
 
         spin_lock_irqsave(&csched_priv.lock, flags);
 
@@ -726,7 +988,7 @@ csched_dom_cntl(
             sdom->weight = op->u.credit.weight;
         }
 
-        if ( op->u.credit.cap != (uint16_t)~0U )
+        if ( op->u.credit.cap != (uint16_t)~0U &&  !atomic_read(&d->hard_virt))
             sdom->cap = op->u.credit.cap;
 
         spin_unlock_irqrestore(&csched_priv.lock, flags);
@@ -783,6 +1045,7 @@ csched_runq_sort(unsigned int cpu)
     struct csched_vcpu *svc_elem;
     unsigned long flags;
     int sort_epoch;
+    int credit;
 
     sort_epoch = csched_priv.runq_sort;
     if ( sort_epoch == spc->runq_sort_last )
@@ -801,7 +1064,7 @@ csched_runq_sort(unsigned int cpu)
         next = elem->next;
         svc_elem = __runq_elem(elem);
 
-        if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER )
+        if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER || svc_elem->pri == CSCHED_PRI_RR )
         {
             /* does elem need to move up the runq? */
             if ( elem->prev != last_under )
@@ -815,6 +1078,56 @@ csched_runq_sort(unsigned int cpu)
         elem = next;
     }
 
+    elem = runq->next;
+    last_under = runq;
+
+    while ( elem != runq )
+    {
+        next = elem->next;
+        svc_elem = __runq_elem(elem);
+        if (svc_elem->pri != CSCHED_PRI_TS_UNDER && svc_elem->pri != CSCHED_PRI_RR)
+            break;
+        credit = atomic_read (&svc_elem->credit);
+
+        if ( credit >= CSCHED_CREDITS_PER_TSLICE/2 )
+        {
+            /* does elem need to move up the runq? */
+            if ( elem->prev != last_under )
+            {
+                list_del(elem);
+                list_add(elem, last_under);
+            }
+            last_under = elem;
+        }
+
+        elem = next;
+    }
+    /* HV - TODO - This sucks - 3 scans !! - Old-fashioned bubble sort is
+          likely to be no worse in most cases - Consider a rewrite */
+    elem = runq->next;
+    last_under = runq;
+
+    while ( elem != runq )
+    {
+        next = elem->next;
+        svc_elem = __runq_elem(elem);
+        if (svc_elem->pri != CSCHED_PRI_TS_UNDER && svc_elem->pri != CSCHED_PRI_RR)
+            break;
+        if ( svc_elem->pri == CSCHED_PRI_RR )
+        {
+            /* does elem need to move up the runq? */
+            if ( elem->prev != last_under )
+            {
+                list_del(elem);
+                list_add(elem, last_under);
+            }
+            last_under = elem;
+        }
+
+        elem = next;
+    }
+
+
     spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags);
 }
 
@@ -835,6 +1148,8 @@ csched_acct(void)
     int credit_balance;
     int credit_xtra;
     int credit;
+    uint32_t max_credit;
+    int credit_prev, credit_real_incr;
 
 
     spin_lock_irqsave(&csched_priv.lock, flags);
@@ -945,8 +1260,34 @@ csched_acct(void)
             BUG_ON( sdom != svc->sdom );
 
             /* Increment credit */
-            atomic_add(credit_fair, &svc->credit);
             credit = atomic_read(&svc->credit);
+            credit_prev = credit;
+            credit_real_incr = svc->credit_real_incr;
+ 
+            if (credit <= 0)
+                credit += credit_fair;
+            else
+            {
+                if ( sdom->cap != 0U )
+                {
+                   if (!vcpu_runnable(svc->vcpu))
+                   {
+                        credit = credit/2;
+                        if (credit > credit_fair/2)
+                             credit = credit_fair/2;
+                   }
+                }
+                /* If this earned fair share of credits last time
+                     then allow rollover credits */
+                if ( credit_real_incr > credit_fair )
+                {
+                      credit -= credit_real_incr - credit_fair;
+                      if (credit < 0)
+                           credit = 0;
+                }
+                credit += credit_fair;
+            }
+            atomic_set(&svc->credit, credit);
 
             /*
              * Recompute priority or, if VCPU is idling, remove it from
@@ -954,29 +1295,33 @@ csched_acct(void)
              */
             if ( credit < 0 )
             {
-                svc->pri = CSCHED_PRI_TS_OVER;
-
-                /* Park running VCPUs of capped-out domains */
-                if ( sdom->cap != 0U &&
-                     credit < -credit_cap &&
-                     !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
+                if (svc->pri != CSCHED_PRI_RR)
                 {
-                    CSCHED_STAT_CRANK(vcpu_park);
-                    vcpu_pause_nosync(svc->vcpu);
-                    svc->flags |= CSCHED_FLAG_VCPU_PARKED;
-                }
+       		     svc->pri = CSCHED_PRI_TS_OVER;
+
+                     /* Park running VCPUs of capped-out domains */
+                     if ( sdom->cap != 0U &&
+                         credit < -credit_cap &&
+                         !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
+                     {
+                     CSCHED_STAT_CRANK(vcpu_park);
+                     vcpu_pause_nosync(svc->vcpu);
+                     svc->flags |= CSCHED_FLAG_VCPU_PARKED;
+                     }
+                 }
 
                 /* Lower bound on credits */
-                if ( credit < -CSCHED_CREDITS_PER_TSLICE )
+                if ( credit < -(CSCHED_CREDITS_PER_TSLICE<<1) )
                 {
                     CSCHED_STAT_CRANK(acct_min_credit);
-                    credit = -CSCHED_CREDITS_PER_TSLICE;
+                    credit = -(CSCHED_CREDITS_PER_TSLICE<<1);
                     atomic_set(&svc->credit, credit);
                 }
-            }
+	    }
             else
             {
-                svc->pri = CSCHED_PRI_TS_UNDER;
+                if (svc->pri != CSCHED_PRI_RR)
+			svc->pri = CSCHED_PRI_TS_UNDER;
 
                 /* Unpark any capped domains whose credits go positive */
                 if ( svc->flags & CSCHED_FLAG_VCPU_PARKED)
@@ -992,17 +1337,25 @@ csched_acct(void)
                 }
 
                 /* Upper bound on credits means VCPU stops earning */
-                if ( credit > CSCHED_CREDITS_PER_TSLICE )
-                {
+                max_credit = (credit_fair << 1) + credit_fair;
+                if (max_credit > 3*CSCHED_CREDITS_PER_TSLICE/2)
+                    max_credit = 3*CSCHED_CREDITS_PER_TSLICE/2;
+                else if (max_credit < CSCHED_CREDITS_PER_TSLICE/2)
+                    max_credit = CSCHED_CREDITS_PER_TSLICE/2;
+                if ( credit > max_credit ){
+                    credit = max_credit;
                     __csched_vcpu_acct_stop_locked(svc);
-                    credit = 0;
                     atomic_set(&svc->credit, credit);
                 }
+
             }
 
             CSCHED_VCPU_STAT_SET(svc, credit_last, credit);
             CSCHED_VCPU_STAT_SET(svc, credit_incr, credit_fair);
+            svc->credit_real_incr = credit - credit_prev;
             credit_balance += credit;
+            if (credit_fair > svc->credit_real_incr)
+                credit_total += credit_fair - svc->credit_real_incr;
         }
     }
 
@@ -1048,18 +1401,21 @@ csched_tick(void *_cpu)
      * once per accounting period (currently 30 milliseconds).
      */
     csched_runq_sort(cpu);
+    cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
 
     set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
 }
 
 static struct csched_vcpu *
-csched_runq_steal(int peer_cpu, int cpu, int pri)
+csched_runq_steal(int peer_cpu, int cpu, int pri, int credit)
 {
     const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu);
     const struct vcpu * const peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
     struct csched_vcpu *speer;
     struct list_head *iter;
     struct vcpu *vc;
+    int speer_credit;
+
 
     /*
      * Don't steal from an idle CPU's runq because it's about to
@@ -1075,8 +1431,10 @@ csched_runq_steal(int peer_cpu, int cpu, int pri)
              * If next available VCPU here is not of strictly higher
              * priority than ours, this PCPU is useless to us.
              */
-            if ( speer->pri <= pri )
-                break;
+            speer_credit = atomic_read(&speer->credit);
+            if ( speer->pri <= CSCHED_PRI_IDLE || speer->pri < pri
+                || (speer->pri == pri && speer_credit <= (credit+(CSCHED_CREDITS_PER_TSLICE>>3)) ) )
+                 break;
 
             /* Is this VCPU is runnable on our PCPU? */
             vc = speer->vcpu;
@@ -1099,11 +1457,12 @@ csched_runq_steal(int peer_cpu, int cpu, int pri)
 }
 
 static struct csched_vcpu *
-csched_load_balance(int cpu, struct csched_vcpu *snext)
+csched_load_balance(int cpu, struct csched_vcpu *snext, int credit)
 {
     struct csched_vcpu *speer;
     cpumask_t workers;
     int peer_cpu;
+    int repeat_count = 15, lock_failure_flag = 0;
 
     BUG_ON( cpu != snext->vcpu->processor );
 
@@ -1114,6 +1473,7 @@ csched_load_balance(int cpu, struct csched_vcpu *snext)
     else
         CSCHED_STAT_CRANK(load_balance_other);
 
+  spinLockRetry:
     /*
      * Peek at non-idling CPUs in the system, starting with our
      * immediate neighbour.
@@ -1137,23 +1497,156 @@ csched_load_balance(int cpu, struct csched_vcpu *snext)
         if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
         {
             CSCHED_STAT_CRANK(steal_trylock_failed);
+	    lock_failure_flag = 1;
             continue;
         }
 
         /*
          * Any work over there to steal?
          */
-        speer = csched_runq_steal(peer_cpu, cpu, snext->pri);
+        speer = csched_runq_steal(peer_cpu, cpu, snext->pri, credit);
         spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
         if ( speer != NULL )
             return speer;
     }
+   
+    if ( lock_failure_flag && snext->pri == CSCHED_PRI_IDLE && repeat_count > 1 )
+    {
+        lock_failure_flag = 0;
+        repeat_count--;
+        goto spinLockRetry;
+     }
 
     /* Failed to find more important work elsewhere... */
     __runq_remove(snext);
     return snext;
 }
 
+static struct csched_vcpu *
+csched_runq_rr_steal(int peer_cpu, int cpu)
+{
+    const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu);
+    const struct vcpu * const peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
+    struct csched_vcpu *speer;
+    struct list_head *iter;
+    struct vcpu *vc;
+
+
+    /*
+     * Don't steal from an idle CPU's runq because it's about to
+     * pick up work from it itself.
+     */
+    if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) )
+    {
+        list_for_each( iter, &peer_pcpu->runq )
+        {
+            speer = __runq_elem(iter);
+
+            /** If next available VCPU here is not of strictly higher
+             * priority than ours, this PCPU is useless to us.
+             */
+            if ( speer->pri < CSCHED_PRI_RR )
+                 break;
+
+            /* Is this VCPU is runnable on our PCPU? */
+            vc = speer->vcpu;
+            BUG_ON( is_idle_vcpu(vc) );
+
+            if (__csched_vcpu_is_migrateable(vc, cpu))
+            {
+                /* We got a candidate. Grab it! */
+                CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
+                CSCHED_STAT_CRANK(migrate_queued);
+                __runq_remove(speer);
+                vc->processor = cpu;
+                return speer;
+            }
+        }
+    }
+
+    return NULL;
+}
+
+static struct csched_vcpu *
+csched_rr_load_balance(int cpu, struct csched_vcpu *snext)
+{
+    struct csched_vcpu *speer;
+    cpumask_t workers;
+    int peer_cpu;
+    int repeat_count = 15, lock_failure_flag = 0;
+
+    BUG_ON( cpu != snext->vcpu->processor );
+
+  spinLockRetry:
+
+    cpus_and(workers, cpu_online_map, cpu_online_map);
+    cpu_clear(cpu, workers);
+    peer_cpu = cpu;
+
+    while ( !cpus_empty(workers) )
+    {
+        peer_cpu = __cycle_cpu(peer_cpu, &workers);
+        cpu_clear(peer_cpu, workers);
+
+        /*
+         * Get ahold of the scheduler lock for this peer CPU.
+         *
+         * Note: We don't spin on this lock but simply try it. Spinning could
+         * cause a deadlock if the peer CPU is also load balancing and trying
+         * to lock this CPU.
+         */
+        if ( !cpu_isset(peer_cpu, csched_priv.hard_virt_multiple))
+            continue;
+
+        if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
+        {
+            CSCHED_STAT_CRANK(steal_trylock_failed);
+            lock_failure_flag = 1;
+            continue;
+        }
+
+        /*
+         * Any work over there to steal?
+         */
+        speer = csched_runq_rr_steal(peer_cpu, cpu);
+        spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
+        if ( speer != NULL )
+        {
+            CSCHED_STAT_CRANK(rt_vcpu_migrate);
+            return speer;
+        }
+    }
+
+    if ( lock_failure_flag && snext->pri < CSCHED_PRI_RR && repeat_count > 1 )
+    {
+        lock_failure_flag = 0;
+        repeat_count--;
+        goto spinLockRetry;
+     }
+
+    /* Failed to find more important work elsewhere... */
+    __runq_remove(snext);
+    return snext;
+}
+
+
+ 
+static struct csched_vcpu * __runq_find_dom0_vcpu(int cpu)
+{
+    const struct list_head * const runq = RUNQ(cpu);
+    struct list_head *iter;
+
+    list_for_each( iter, runq )
+    {
+        struct csched_vcpu * iter_svc = __runq_elem(iter);
+        if (iter_svc->pri <= CSCHED_PRI_IDLE)
+            break;
+        if (iter_svc->vcpu->domain->domain_id == 0)
+            return iter_svc;
+    }
+   return NULL;
+}
+
 /*
  * This function is in the critical path. It is designed to be simple and
  * fast for the common case.
@@ -1166,6 +1659,8 @@ csched_schedule(s_time_t now)
     struct csched_vcpu * const scurr = CSCHED_VCPU(current);
     struct csched_vcpu *snext;
     struct task_slice ret;
+    int credit;
+    struct csched_vcpu *temp_snext;
 
     CSCHED_STAT_CRANK(schedule);
     CSCHED_VCPU_CHECK(current);
@@ -1173,11 +1668,26 @@ csched_schedule(s_time_t now)
     /*
      * Select next runnable local VCPU (ie top of local runq)
      */
+    if (opt_hardvirt && current->domain->domain_id == 0 && vcpu_runnable(current))
+    {
+        snext = scurr;
+        goto dom0_bypass;
+    }
+
     if ( vcpu_runnable(current) )
         __runq_insert(cpu, scurr);
     else
         BUG_ON( is_idle_vcpu(current) || list_empty(runq) );
 
+    if (opt_hardvirt &&  NUMBER_DOM0_VCPUS_PRESENT(cpu) > 0)
+    {
+        snext = __runq_find_dom0_vcpu(cpu);
+        if (snext){
+             __runq_remove(snext);
+             goto dom0_bypass;
+        }
+    }
+
     snext = __runq_elem(runq->next);
 
     /*
@@ -1188,10 +1698,31 @@ csched_schedule(s_time_t now)
      * urgent work... If not, csched_load_balance() will return snext, but
      * already removed from the runq.
      */
-    if ( snext->pri > CSCHED_PRI_TS_OVER )
+    /* HV - hard_virt_multiple might report false positive if a RR vcpu was put to sleep when
+       it was in the runq or migrated off- Acceptable tradeoff for overhead of updating
+       maps at sleep/wakeup points.
+       Since hard_virt_multiple for self isn't updated at this point, there is a
+       very small chance of false positive from self - HV */
+    if ( snext->pri < CSCHED_PRI_RR && !cpus_empty(csched_priv.hard_virt_multiple) )
+    {
+        CSCHED_STAT_CRANK(rt_imbalance);
+        temp_snext = csched_rr_load_balance(cpu, snext);
+        if (temp_snext){
+             snext = temp_snext;
+             goto dom0_bypass;
+        }
+    }
+
+    credit = atomic_read(&snext->credit);
+    if ( snext->pri > CSCHED_PRI_TS_OVER && credit > (CSCHED_CREDITS_PER_TSLICE >> 2))
         __runq_remove(snext);
-    else
-        snext = csched_load_balance(cpu, snext);
+    else{
+        if (snext->pri <= CSCHED_PRI_IDLE)
+              credit = -(CSCHED_CREDITS_PER_TSLICE<<1);
+        snext = csched_load_balance(cpu, snext, credit);
+    }
+
+  dom0_bypass:
 
     /*
      * Update idlers mask if necessary. When we're idling, other CPUs
@@ -1206,6 +1737,22 @@ csched_schedule(s_time_t now)
     {
         cpu_clear(cpu, csched_priv.idlers);
     }
+    if ( snext->pri == CSCHED_PRI_RR )
+    {
+        if ( cpu_isset(cpu, csched_priv.hard_virt_none) )
+             cpu_clear(cpu, csched_priv.hard_virt_none);
+        if (!list_empty(runq) && __runq_elem(runq->next)->pri == CSCHED_PRI_RR)
+             cpu_set(cpu, csched_priv.hard_virt_multiple);
+        else
+             cpu_clear(cpu, csched_priv.hard_virt_multiple);
+    }
+    else
+    {
+        if (!cpu_isset(cpu, csched_priv.hard_virt_none))
+             cpu_set(cpu, csched_priv.hard_virt_none);
+        if (cpu_isset(cpu, csched_priv.hard_virt_multiple))
+             cpu_clear(cpu, csched_priv.hard_virt_multiple);
+    }
 
     /*
      * Return task to run next...
@@ -1231,7 +1778,7 @@ csched_dump_vcpu(struct csched_vcpu *svc)
 
     if ( sdom )
     {
-        printk(" credit=%i [w=%u]", atomic_read(&svc->credit), sdom->weight);
+        printk(" credit=%i of %d [w=%u]", atomic_read(&svc->credit), svc->credit_real_incr, sdom->weight);
 #ifdef CSCHED_STATS
         printk(" (%d+%u) {a/i=%u/%u m=%u+%u}",
                 svc->stats.credit_last,
@@ -1257,10 +1804,11 @@ csched_dump_pcpu(int cpu)
     spc = CSCHED_PCPU(cpu);
     runq = &spc->runq;
 
-    printk(" sort=%d, sibling=0x%lx, core=0x%lx\n",
+    printk(" sort=%d, sibling=0x%lx, core=0x%lx dom0=%u\n",
             spc->runq_sort_last,
             cpu_sibling_map[cpu].bits[0],
-            cpu_core_map[cpu].bits[0]);
+            cpu_core_map[cpu].bits[0],
+	    NUMBER_DOM0_VCPUS_PRESENT(cpu));
 
     /* current VCPU */
     svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
@@ -1313,6 +1861,8 @@ csched_dump(void)
            CSCHED_TICKS_PER_ACCT);
 
     printk("idlers: 0x%lx\n", csched_priv.idlers.bits[0]);
+    printk("hard_virt_none: 0x%lx\n", csched_priv.hard_virt_none.bits[0]);
+    printk("hard_virt_multiple: 0x%lx\n", csched_priv.hard_virt_multiple.bits[0]);
 
     CSCHED_STATS_PRINTK();
 
@@ -1346,6 +1896,9 @@ csched_init(void)
     csched_priv.credit = 0U;
     csched_priv.credit_balance = 0;
     csched_priv.runq_sort = 0U;
+    spin_lock_init(&csched_priv.hard_virt_lock); /* HV */
+    cpus_clear(csched_priv.hard_virt_none);
+    cpus_clear(csched_priv.hard_virt_multiple);
     CSCHED_STATS_RESET();
 }
 
diff --git a/include/xen/sched.h b/include/xen/sched.h
index f3f36e8..38ccc1d 100644
--- a/include/xen/sched.h
+++ b/include/xen/sched.h
@@ -227,6 +227,9 @@ struct domain
     int32_t time_offset_seconds;
 
     struct rcu_head rcu;
+    /* HV */
+    atomic_t hard_virt;
+
 };
 
 struct domain_setup_info
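
For completeness, a hedged usage sketch showing how the bit-15 convention
described above could be driven through the credit scheduler's existing
libxc entry points.  The xc_sched_credit_domain_get/set names and the
int-based xc_handle match the Xen 3.x tools of this era, but treat the
exact signatures as an assumption rather than part of this patch:

    #include <stdint.h>
    #include <xenctrl.h>

    /* Sketch: mark an existing guest as hard-virt by setting bit 15 of
     * its credit-scheduler weight.  The hypervisor rejects the request
     * if the domain is capped, is dom0, or if too few online pcpus are
     * available for its vcpus. */
    static int make_hard_virt(int xc_handle, uint32_t domid)
    {
        struct xen_domctl_sched_credit sdom;

        if ( xc_sched_credit_domain_get(xc_handle, domid, &sdom) )
            return -1;

        sdom.weight = (uint16_t)((sdom.weight & 0x7fff) | (1u << 15));
        /* cap is left as returned; the conversion fails anyway if it is
         * nonzero */

        return xc_sched_credit_domain_set(xc_handle, domid, &sdom);
    }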