Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > 89877e42827f16fa5f86b1df0c2860b1 > files > 2802

kernel-2.6.18-128.1.10.el5.src.rpm

From: Rik van Riel <riel@redhat.com>
Date: Thu, 13 Dec 2007 14:18:37 -0500
Subject: [xen] hv: cpu frequency scaling
Message-id: 20071213141837.6b1f09f0@cuia.boston.redhat.com
O-Subject: [RHEL 5.2 PATCH 1/3] cpu frequency scaling for Xen BZ#251969
Bugzilla: 251969

Xen CPU frequency change support patch for RHEL 5.2, based
on the following upstream changesets:

continue_hypercall_on_cpu (only, did not get rest of changeset)
	15616:858b9bc8d0e6
	15624:bb5c23bbc7b7

cpufreq: Support cpufreq updates on AMD hardware by dom0 kernel.
	15924:2477e94450aa

x86: Fix time going backwards on CPU frequency change.
	b3814860d170b29daa8ee79eec3a6de603c68b9d

Add XENPF_getidletime.
	16000:50edcaff5520

Allow dom0 kernel to govern cpufreq via Intel Enhanced SpeedStep MSR
	16202:c05ec22a9106

x86, dom0: Allow get idle time stats by mask.
	16203:4393255607be

Fix xenctl_cpumap_to_cpumask.
	16519:62451388f630

Acked-by: Bill Burns <bburns@redhat.com>
Acked-by: "Stephen C. Tweedie" <sct@redhat.com>

diff --git a/arch/x86/domain.c b/arch/x86/domain.c
index dbaf67b..9a0ba98 100644
--- a/arch/x86/domain.c
+++ b/arch/x86/domain.c
@@ -1346,6 +1346,65 @@ void sync_vcpu_execstate(struct vcpu *v)
     flush_tlb_mask(v->vcpu_dirty_cpumask);
 }
 
+struct migrate_info {            /* Handed to the helper via continue_info. */
+    long (*func)(void *data);    /* Continuation the helper calls. */
+    void *data;                  /* Opaque argument passed to func. */
+    void (*saved_schedule_tail)(struct vcpu *); /* Hook that was replaced. */
+    cpumask_t saved_affinity;    /* v->cpu_affinity before re-pinning. */
+};
+
+/* schedule_tail hook: finish a hypercall deferred to another cpu. */
+static void continue_hypercall_on_cpu_helper(struct vcpu *v)
+{
+    struct cpu_user_regs *regs = guest_cpu_user_regs();
+    struct migrate_info *info = v->arch.continue_info;
+    cpumask_t mask = info->saved_affinity; /* Copied: info is freed below. */
+
+    regs->eax = info->func(info->data);    /* Becomes the hypercall result. */
+    v->arch.schedule_tail = info->saved_schedule_tail;
+    v->arch.continue_info = NULL;
+    xfree(info);
+
+    vcpu_set_affinity(v, &mask);   /* Restore the affinity saved by caller. */
+    schedule_tail(v);
+}
+
+int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
+{
+    struct vcpu *v = current;
+    struct migrate_info *info;
+    cpumask_t mask = cpumask_of_cpu(cpu);
+    int rc;
+    /* Run func(data) on 'cpu': inline if already there, otherwise deferred. */
+    if ( cpu == smp_processor_id() )
+        return func(data);
+
+    info = xmalloc(struct migrate_info);
+    if ( info == NULL )
+        return -ENOMEM;
+    /* Record what the helper needs and what is overwritten below. */
+    info->func = func;
+    info->data = data;
+    info->saved_schedule_tail = v->arch.schedule_tail;
+    info->saved_affinity = v->cpu_affinity;
+    /* Divert schedule_tail so the helper calls func and frees info. */
+    v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
+    v->arch.continue_info = info;
+    /* Re-pin this vcpu to the requested cpu; undo the hook on failure. */
+    rc = vcpu_set_affinity(v, &mask);
+    if ( rc )
+    {
+        v->arch.schedule_tail = info->saved_schedule_tail;
+        v->arch.continue_info = NULL;
+        xfree(info);
+        return rc;
+    }
+
+    /* Dummy return value will be overwritten by new schedule_tail. */
+    BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
+    return 0;
+}
+
 #define next_arg(fmt, args) ({                                              \
     unsigned long __arg;                                                    \
     switch ( *(fmt)++ )                                                     \
diff --git a/arch/x86/platform_hypercall.c b/arch/x86/platform_hypercall.c
index 110fc70..eef717c 100644
--- a/arch/x86/platform_hypercall.c
+++ b/arch/x86/platform_hypercall.c
@@ -34,10 +34,17 @@ DEFINE_SPINLOCK(xenpf_lock);
 # define copy_from_compat copy_from_guest
 # undef copy_to_compat
 # define copy_to_compat copy_to_guest
+# undef guest_from_compat_handle
+# define guest_from_compat_handle(x,y) ((x)=(y))
 #else
 extern spinlock_t xenpf_lock;
 #endif
 
+static long cpu_frequency_change_helper(void *data)
+{ /* NOTE(review): may run deferred; confirm *data outlives the hypercall. */
+    return cpu_frequency_change(*(uint64_t *)data); /* data -> freq in Hz */
+}
+
 ret_t do_platform_op(XEN_GUEST_HANDLE(xen_platform_op_t) u_xenpf_op)
 {
     ret_t ret = 0;
@@ -258,11 +265,70 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xen_platform_op_t) u_xenpf_op)
     break;
 #endif
 
+    case XENPF_change_freq:
+        ret = -ENOSYS;
+        if ( cpufreq_controller != FREQCTL_dom0_kernel )
+            break;
+        ret = -EINVAL;
+        if ( op->u.change_freq.flags != 0 )
+            break;
+        ret = continue_hypercall_on_cpu(op->u.change_freq.cpu,
+                                        cpu_frequency_change_helper,
+                                        &op->u.change_freq.freq);
+        break;
+
+    case XENPF_getidletime:
+    {
+        uint32_t cpu;
+        uint64_t idletime, now = NOW();
+        struct vcpu *v;
+        struct xenctl_cpumap ctlmap;
+        cpumask_t cpumap;
+        XEN_GUEST_HANDLE(uint8_t) cpumap_bitmap;
+        XEN_GUEST_HANDLE(uint64_t) idletimes;
+
+        ret = -ENOSYS;
+        if ( cpufreq_controller != FREQCTL_dom0_kernel )
+            break;
+
+        ctlmap.nr_cpus  = op->u.getidletime.cpumap_nr_cpus;
+        guest_from_compat_handle(cpumap_bitmap,
+                                 op->u.getidletime.cpumap_bitmap);
+        ctlmap.bitmap.p = cpumap_bitmap.p; /* handle -> handle_64 conversion */
+        xenctl_cpumap_to_cpumask(&cpumap, &ctlmap);
+        guest_from_compat_handle(idletimes, op->u.getidletime.idletime);
+
+        for_each_cpu_mask ( cpu, cpumap )
+        {
+            if ( (v = idle_vcpu[cpu]) != NULL )
+            {
+                idletime = v->runstate.time[RUNSTATE_running];
+                if ( v->is_running )
+                    idletime += now - v->runstate.state_entry_time;
+            }
+            else
+            {
+                idletime = 0;
+                cpu_clear(cpu, cpumap);
+            }
+
+            ret = -EFAULT;
+            if ( copy_to_guest_offset(idletimes, cpu, &idletime, 1) )
+                goto out;
+        }
+
+        op->u.getidletime.now = now;
+        cpumask_to_xenctl_cpumap(&ctlmap, &cpumap);
+        ret = copy_to_guest(u_xenpf_op, op, 1) ? -EFAULT : 0;
+    }
+    break;
+
     default:
         ret = -ENOSYS;
         break;
     }
 
+ out:
     spin_unlock(&xenpf_lock);
 
     return ret;
diff --git a/arch/x86/time.c b/arch/x86/time.c
index 62df2bc..5587683 100644
--- a/arch/x86/time.c
+++ b/arch/x86/time.c
@@ -725,6 +725,27 @@ void update_domain_wallclock_time(struct domain *d)
     spin_unlock(&wc_lock);
 }
 
+int cpu_frequency_change(u64 freq)    /* freq: new frequency of this cpu. */
+{
+    struct cpu_time *t = &this_cpu(cpu_time);
+    u64 curr_tsc;
+
+    local_irq_disable();              /* Update atomically w.r.t. interrupts. */
+    rdtscll(curr_tsc);
+    t->local_tsc_stamp = curr_tsc;
+    t->stime_local_stamp = get_s_time();
+    t->stime_master_stamp = read_platform_stime();
+    set_time_scale(&t->tsc_scale, freq);  /* Rescale TSC for the new freq. */
+    local_irq_enable();
+
+    /* A full epoch should pass before we check for deviation. */
+    set_timer(&t->calibration_timer, NOW() + EPOCH);
+    if ( smp_processor_id() == 0 )
+        platform_time_calibration();
+
+    return 0;                         /* No failure path. */
+}
+
 /* Set clock to <secs,usecs> after 00:00:00 UTC, 1 January, 1970. */
 void do_settime(unsigned long secs, unsigned long nsecs, u64 system_time_base)
 {
@@ -869,12 +890,14 @@ static void local_time_calibration(void *unused)
            error_factor, calibration_mul_frac, tsc_shift);
 #endif
 
-    /* Record new timestamp information. */
+    /* Record new timestamp information, atomically w.r.t. interrupts. */
+    local_irq_disable();
     t->tsc_scale.mul_frac = calibration_mul_frac;
     t->tsc_scale.shift    = tsc_shift;
     t->local_tsc_stamp    = curr_tsc;
     t->stime_local_stamp  = curr_local_stime;
     t->stime_master_stamp = curr_master_stime;
+    local_irq_enable();
 
     update_vcpu_system_time(current);
 
diff --git a/arch/x86/traps.c b/arch/x86/traps.c
index 36d5b94..a85dff8 100644
--- a/arch/x86/traps.c
+++ b/arch/x86/traps.c
@@ -1723,10 +1723,22 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
             v->arch.guest_context.gs_base_user = res;
             break;
 #endif
+        case MSR_K8_FIDVID_STATUS:
+        case MSR_K8_FIDVID_CTL:
+            if ( (cpufreq_controller != FREQCTL_dom0_kernel) ||
+                 (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) ||
+                 wrmsr_safe(regs->ecx, eax, edx) )
+                eax = 0;
+            break;
+        case MSR_IA32_PERF_CTL:
+            if ( (cpufreq_controller != FREQCTL_dom0_kernel) ||
+                 (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
+                 wrmsr_safe(regs->ecx, eax, edx) )
+                goto fail;
+            break;
         default:
             if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
                 break;
-
             if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
                  (eax != l) || (edx != h) )
                 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
@@ -1759,6 +1771,13 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
             regs->edx = v->arch.guest_context.gs_base_user >> 32;
             break;
 #endif
+        case MSR_K8_FIDVID_CTL:
+        case MSR_K8_FIDVID_STATUS:
+            if ( (cpufreq_controller != FREQCTL_dom0_kernel) ||
+                 (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) ||
+                 rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
+                regs->eax = regs->edx = 0;
+            break;
         case MSR_EFER:
             if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
                 goto fail;
diff --git a/common/domctl.c b/common/domctl.c
index 5d29667..dfd54dd 100644
--- a/common/domctl.c
+++ b/common/domctl.c
@@ -43,7 +43,8 @@ void cpumask_to_xenctl_cpumap(
 
     bitmap_long_to_byte(bytemap, cpus_addr(*cpumask), NR_CPUS);
 
-    copy_to_guest(xenctl_cpumap->bitmap, bytemap, copy_bytes);
+    if ( copy_bytes != 0 )
+        copy_to_guest(xenctl_cpumap->bitmap, bytemap, copy_bytes);
 
     for ( i = copy_bytes; i < guest_bytes; i++ )
         copy_to_guest_offset(xenctl_cpumap->bitmap, i, &zero, 1);
@@ -55,15 +56,20 @@ void xenctl_cpumap_to_cpumask(
     unsigned int guest_bytes, copy_bytes;
     uint8_t bytemap[(NR_CPUS + 7) / 8];
 
+    if ( guest_handle_is_null(xenctl_cpumap->bitmap) )
+        return;
+
     guest_bytes = (xenctl_cpumap->nr_cpus + 7) / 8;
     copy_bytes  = min_t(unsigned int, guest_bytes, sizeof(bytemap));
 
-    cpus_clear(*cpumask);
-
-    if ( guest_handle_is_null(xenctl_cpumap->bitmap) )
-        return;
+    memset(bytemap, 0, sizeof(bytemap));
 
-    copy_from_guest(bytemap, xenctl_cpumap->bitmap, copy_bytes);
+    if ( copy_bytes != 0 )
+    {
+        copy_from_guest(bytemap, xenctl_cpumap->bitmap, copy_bytes);
+        if ( (xenctl_cpumap->nr_cpus & 7) && (guest_bytes <= sizeof(bytemap)) )
+            bytemap[guest_bytes-1] &= ~(0xff << (xenctl_cpumap->nr_cpus & 7));
+    }
 
     bitmap_byte_to_long(cpus_addr(*cpumask), bytemap, NR_CPUS);
 }
diff --git a/common/schedule.c b/common/schedule.c
index 7911f5f..70d04ee 100644
--- a/common/schedule.c
+++ b/common/schedule.c
@@ -41,6 +41,17 @@ string_param("sched", opt_sched);
 static unsigned int opt_dom0_vcpus_pin;
 boolean_param("dom0_vcpus_pin", opt_dom0_vcpus_pin);
 
+enum cpufreq_controller cpufreq_controller; /* FREQCTL_none by default. */
+static void __init setup_cpufreq_option(char *str)
+{
+    if ( !strcmp(str, "dom0-kernel") )    /* Any other value is ignored. */
+    {
+        cpufreq_controller = FREQCTL_dom0_kernel;
+        opt_dom0_vcpus_pin = 1;           /* Same flag as "dom0_vcpus_pin". */
+    }
+}
+custom_param("cpufreq", setup_cpufreq_option);
+
 #define TIME_SLOP      (s32)MICROSECS(50)     /* allow time to slip a bit */
 
 /* Various timer handlers. */
diff --git a/include/asm-x86/domain.h b/include/asm-x86/domain.h
index c60c1f0..6e00e4d 100644
--- a/include/asm-x86/domain.h
+++ b/include/asm-x86/domain.h
@@ -268,6 +268,9 @@ struct arch_vcpu
     void (*ctxt_switch_from) (struct vcpu *);
     void (*ctxt_switch_to) (struct vcpu *);
 
+    /* Record information required to continue execution after migration */
+    void *continue_info;
+
     /* Bounce information for propagating an exception to guest OS. */
     struct trap_bounce trap_bounce;
 
@@ -311,10 +314,13 @@ struct arch_vcpu
     unsigned long vcpu_info_mfn;
 } __cacheline_aligned;
 
-/* shorthands to improve code legibility */
+/* Shorthands to improve code legibility. */
 #define hvm_vmx         hvm_vcpu.u.vmx
 #define hvm_svm         hvm_vcpu.u.svm
 
+/* Continue the current hypercall via func(data) on specified cpu. */
+int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data);
+
 #endif /* __ASM_DOMAIN_H__ */
 
 /*
diff --git a/include/asm-x86/msr.h b/include/asm-x86/msr.h
index b821173..bb224b1 100644
--- a/include/asm-x86/msr.h
+++ b/include/asm-x86/msr.h
@@ -357,6 +357,9 @@ static inline void write_efer(__u64 val)
 #define MSR_K8_VM_CR			0xC0010114
 #define MSR_K8_VM_HSAVE_PA		0xC0010117
 
+#define MSR_K8_FIDVID_CTL		0xC0010041
+#define MSR_K8_FIDVID_STATUS		0xC0010042
+
 /* MSR_K8_VM_CR bits: */
 #define _K8_VMCR_SVME_DISABLE		4
 #define K8_VMCR_SVME_DISABLE		(1 << _K8_VMCR_SVME_DISABLE)
diff --git a/include/asm-x86/time.h b/include/asm-x86/time.h
index 85bc78b..464d5dd 100644
--- a/include/asm-x86/time.h
+++ b/include/asm-x86/time.h
@@ -26,4 +26,6 @@ extern int time_resume(void);
 
 extern void init_percpu_time(void);
 
+int cpu_frequency_change(u64 freq);
+
 #endif /* __X86_TIME_H__ */
diff --git a/include/public/platform.h b/include/public/platform.h
index 139375c..0659b03 100644
--- a/include/public/platform.h
+++ b/include/public/platform.h
@@ -158,6 +158,41 @@ DEFINE_XEN_GUEST_HANDLE(xenpf_firmware_info_t);
 typedef struct xenpf_stratus_call xenpf_stratus_call_t;
 DEFINE_XEN_GUEST_HANDLE(xenpf_stratus_call_t);
 
+#define XENPF_change_freq         52
+struct xenpf_change_freq {
+    /* IN variables */
+    uint32_t flags; /* Must be zero. */
+    uint32_t cpu;   /* Physical cpu. */
+    uint64_t freq;  /* New frequency (Hz). */
+};
+typedef struct xenpf_change_freq xenpf_change_freq_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_change_freq_t);
+
+/*
+ * Get idle times (nanoseconds since boot) for physical CPUs specified in the
+ * @cpumap_bitmap with range [0..@cpumap_nr_cpus-1]. The @idletime array is
+ * indexed by CPU number; only entries with the corresponding @cpumap_bitmap
+ * bit set are written to. On return, @cpumap_bitmap is modified so that any
+ * non-existent CPUs are cleared. Such CPUs have their @idletime array entry
+ * cleared.
+ */
+#define XENPF_getidletime         53
+struct xenpf_getidletime {
+    /* IN/OUT variables */
+    /* IN: CPUs to interrogate; OUT: subset of IN which are present */
+    XEN_GUEST_HANDLE(uint8_t) cpumap_bitmap;
+    /* IN variables */
+    /* Number of CPUs (bits) described by cpumap_bitmap. */
+    uint32_t cpumap_nr_cpus;
+    /* Must be indexable for every cpu in cpumap_bitmap. */
+    XEN_GUEST_HANDLE(uint64_t) idletime;
+    /* OUT variables */
+    /* System time when the idletime snapshots were taken. */
+    uint64_t now;
+};
+typedef struct xenpf_getidletime xenpf_getidletime_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_getidletime_t);
+
 struct xen_platform_op {
     uint32_t cmd;
     uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
@@ -169,6 +204,8 @@ struct xen_platform_op {
         struct xenpf_microcode_update  microcode;
         struct xenpf_platform_quirk    platform_quirk;
         struct xenpf_firmware_info     firmware_info;
+	struct xenpf_change_freq       change_freq;
+	struct xenpf_getidletime       getidletime;
 	struct xenpf_stratus_call      stratus_call;
         uint8_t                        pad[128];
     } u;
diff --git a/include/xen/sched.h b/include/xen/sched.h
index 703b339..1840c13 100644
--- a/include/xen/sched.h
+++ b/include/xen/sched.h
@@ -496,6 +496,10 @@ static inline void vcpu_unblock(struct vcpu *v)
 #define is_hvm_domain(d) ((d)->is_hvm)
 #define is_hvm_vcpu(v)   (is_hvm_domain(v->domain))
 
+extern enum cpufreq_controller {
+    FREQCTL_none, FREQCTL_dom0_kernel   /* Chosen by the "cpufreq" option. */
+} cpufreq_controller;
+
 #endif /* __SCHED_H__ */
 
 /*